Compare commits

...

335 Commits
1.1.2 ... 1.2.6

Author SHA1 Message Date
Balazs Gerofi
bf5ac7afc8 remote_flush_tlb_array_cpumask(): bundle remote TLB invalidations 2017-07-21 15:34:48 +09:00
Balazs Gerofi
bc423255d9 mcctrl/mcexec: limit thread pool size when too many threads exist on Linux 2017-07-21 15:33:19 +09:00
Balazs Gerofi
6714161c25 profile remote TLB invalidations 2017-07-20 22:28:25 +09:00
Balazs Gerofi
992a292c08 profile: better time breakdown and exclusion of idle cycles 2017-07-20 17:36:34 +09:00
Tomoki Shirasawa
64c2e437c6 open: check filename address (re-commit) 2017-07-19 11:37:55 +09:00
Balazs Gerofi
dd9675d65e NUMA: only print a short summary at boot time 2017-07-19 09:11:44 +09:00
Balazs Gerofi
51ed8dce06 numa_init(): fix rusage memory counting 2017-07-19 08:23:05 +09:00
Tomoki Shirasawa
01f5e46865 revert 2d7890731e 2017-07-18 12:13:48 +09:00
Masamichi Takagi
38961fca78 Revert "do_fork(): RLIMIT_NPROC check"
This reverts commit 035e7913d8.
2017-07-13 04:13:41 +09:00
Tomoki Shirasawa
2d7890731e add_process_memory_range: do not initialize page when did not present phys page 2017-07-18 00:45:18 +09:00
Tomoki Shirasawa
7d181fccd9 open: check filename address 2017-07-18 00:09:39 +09:00
Tomoki Shirasawa
bd75e80df2 terminate: fix to reference freed pointer 2017-07-17 19:32:08 +09:00
Masamichi Takagi
035e7913d8 do_fork(): RLIMIT_NPROC check
1. mcexec sets RLIMIT_NPROC to the number of mcexec threads.
2. do_fork() gets the current number of threads by calling rusage function.
3. do_fork() returns -EAGAIN when the limit is exceeded.
2017-07-12 20:42:38 +09:00
Tomoki Shirasawa
7d38c7c147 delete debug print 2017-07-14 10:13:22 +09:00
Tomoki Shirasawa
a801bcc591 delete rusage.c 2017-07-14 09:52:33 +09:00
Tomoki Shirasawa
d7b8e7f4f4 fix to count user pages
refs #864
2017-07-14 09:51:39 +09:00
Masamichi Takagi
6afea4af48 mcexec: Fix debug/error messages 2017-07-12 14:30:21 +09:00
Masamichi Takagi
6415dcfdcc mcexec: Disable address space layout randomization
Move the code from mcreboot.sh to mcexec.c.
2017-07-12 14:17:38 +09:00
Balazs Gerofi
0f58e9e77d NUMA: expose correct /sys/devices/system/node/nodeX/meminfo 2017-07-07 00:59:32 +09:00
Masamichi Takagi
72e3f5ee50 ihk_mc_get_ikc_cpu(): Get IKC destination CPU 2017-07-11 20:20:40 +09:00
Masamichi Takagi
8d57ad9bc4 pmc_start, pmc_stop: Error check on counter number 2017-07-11 19:05:45 +09:00
Balazs Gerofi
35b36c2d33 move_pages_smp_handler(): more parallelization 2017-07-08 18:36:13 +09:00
Balazs Gerofi
632611d78c mbind(): debug msg 2017-07-08 18:36:13 +09:00
Balazs Gerofi
d48d44d365 move_pages(): fix barrier in parallel implementation 2017-07-08 18:36:13 +09:00
Balazs Gerofi
4c0f401424 move_pages(): parallel implementation v1 2017-07-08 18:36:05 +09:00
Balazs Gerofi
06f824c829 pte_update_phys(): update physical address of a PTE 2017-07-08 18:36:05 +09:00
Balazs Gerofi
7a606baad4 move_pages(): sequential implementation 2017-07-08 18:36:05 +09:00
Balazs Gerofi
4c6c66555e memset_smp(): parallel memset 2017-07-08 18:36:05 +09:00
Balazs Gerofi
8426cf589a ihk_pagealloc_free(): report double-free in bitmap based allocator 2017-07-08 18:36:05 +09:00
Balazs Gerofi
da7421e8ee memdebug: more detailed error report 2017-07-08 18:36:05 +09:00
Balazs Gerofi
209748d913 visit_pte_range(): visit L1 PTEs but don't free for MF_PREMAP files 2017-07-08 18:36:04 +09:00
Balazs Gerofi
f81722c63b __mckernel_free_pages_in_allocator(): fix deallocation of invalid physical range 2017-07-08 18:35:50 +09:00
Balazs Gerofi
2189c55d99 x86: ASM fast memset() 2017-07-08 18:26:51 +09:00
Balazs Gerofi
201a7e2595 Red-black tree based physical memory management 2017-07-08 18:26:51 +09:00
Balazs Gerofi
5cdd194856 Port Linux red-black trees 2017-07-08 18:12:01 +09:00
Tomoki Shirasawa
0061adadfb temporary fix for bug #889 2017-07-04 12:04:37 +09:00
Tomoki Shirasawa
67843151d3 fix how to count rss and num of threads
refs #864
refs #865
2017-07-03 16:27:46 +09:00
Tomoki Shirasawa
083cf3fcc9 rusage_max_memory is set to the sum of all memory chunks
refs #891
2017-07-03 14:49:35 +09:00
Tomoki Shirasawa
4236323661 add SCD_MSG_EVENT_SIGNAL
refs #862
2017-07-03 14:49:13 +09:00
Tomoki Shirasawa
5a9bee55c9 kill system call offloading from interrupt_syscall (tid == -1) change to one sided communication
refs #889
2017-07-03 14:48:42 +09:00
Tomoki Shirasawa
6e23b07b20 disable switch until to complete thread termination
refs #888
2017-07-03 14:47:48 +09:00
Masamichi Takagi
e64bd49d9e Add comment for x86_sregs 2017-07-03 10:43:36 +09:00
Masamichi Takagi
72b8f99d3b Correct comment for do_page_fault_process_vm() 2017-07-03 10:43:36 +09:00
Tomoki Shirasawa
090937a5a3 fix out of tree build 2017-06-30 09:57:50 +09:00
Tomoki Shirasawa
2082acdf0d add executer/user/arch/x86_64/Makefile.in 2017-06-28 09:36:31 +09:00
Tomoki Shirasawa
a8f11634e6 remove debug print for uti tracer 2017-06-27 14:42:04 +09:00
Tomoki Shirasawa
4f9865cc8f clean up unused code 2017-06-27 13:46:38 +09:00
Tomoki Shirasawa
07efb3ab9a support to utility thread offloading 2017-06-27 13:27:09 +09:00
Balazs Gerofi
2afc9d37d1 fix config.h inclusion 2017-06-17 07:05:33 +09:00
Masamichi Takagi
fa6f20a3c4 Correct comments in gencore.c 2017-06-16 21:47:23 +09:00
Balazs Gerofi
52bc052e1a mcexec: recursively bind mount $prefix/rootfs/ on / 2017-06-16 18:01:25 +09:00
Balazs Gerofi
f84415c310 mcexec: use atobytes() for MCKERNEL_RLIMIT_STACK 2017-06-15 16:50:34 +09:00
Balazs Gerofi
1a853e07d7 rus_vm_fault(): fix misaligned address before accessing PTE 2017-06-14 20:32:03 +09:00
Ken Sato
07b0954610 IKC: add ihk_ikc_direction to ihk_ikc_listen_param. refs #841 2017-06-13 16:33:15 +09:00
Balazs Gerofi
1f006b2381 remote_page_fault(): free remote PF response packet to avoid memory leak 2017-06-12 22:03:12 +09:00
Balazs Gerofi
4dfd806aa7 mcctrl: release syscall packets to LWK -> Linux channels 2017-06-12 22:02:32 +09:00
Balazs Gerofi
c6e3185246 mcctrl: clean up RUS page hash at job completion 2017-06-12 13:04:03 +09:00
Balazs Gerofi
d9e6ff235d mcctrl: track and clean up ikc2linux channels 2017-06-12 13:03:07 +09:00
Balazs Gerofi
b03f69783a mcctrl: cleanup devobj pagers in release_handle() to avoid memory leak 2017-06-11 19:13:31 +09:00
Balazs Gerofi
ab915f3331 mcctrl: clean up pagers for file objects to avoid memory leak 2017-06-11 19:11:54 +09:00
Tomoki Shirasawa
7773c4aef6 add log print for existing processes/threads
usage: ihkosctl 0 ioctl 40000000 [1-4]
1: print for existing processes
2: print for existing threads
3: print for existing processes without process lock
4: print for existing threads without thread lock
2017-06-11 15:19:24 +09:00
Ken Sato
58e531eb58 mcreboot: add taskset -c 0 to insmod. refs #848 2017-06-09 17:18:45 +09:00
Balazs Gerofi
9beef7d901 sysfs: fix directory memory leak 2017-06-09 15:51:41 +09:00
Balazs Gerofi
0733592eb5 mcexec_open_exec() fix filename memory leak 2017-06-09 15:51:14 +09:00
Balazs Gerofi
4d0e0728f4 destroy_thread(): disable IRQ while holding update lock 2017-06-08 17:40:35 +09:00
Balazs Gerofi
66fad4c7a4 terminate(): do not iterate process hash if no children processes exist 2017-06-08 14:53:57 +09:00
Balazs Gerofi
5758dba7cf use spinlocks in MCS rwlock 2017-06-08 14:16:29 +09:00
Balazs Gerofi
1ca16b9693 rusage: add kernel/include/config.h.in 2017-06-08 09:02:52 +09:00
Balazs Gerofi
d29922c820 configure: re-autoreconf 2017-06-07 17:33:32 +09:00
Balazs Gerofi
46b48ac59b __return_syscall(): verify response structure 2017-06-07 17:21:55 +09:00
Balazs Gerofi
446ef0465b mcctrl: verify ihk_device_map_virtual()'d buffer before accessing 2017-06-07 17:21:55 +09:00
Balazs Gerofi
200fe9aec4 mcctrl/mcexec: fix per-process data reference counting 2017-06-07 17:21:55 +09:00
Balazs Gerofi
fedba28a93 extend_process_region(): fix alignment 2017-06-07 17:21:55 +09:00
Masamichi Takagi
b527503937 Fix rusage 2017-06-07 15:15:20 +09:00
Masamichi Takagi
6bdafbd33b Fix rusage 2017-06-07 09:30:42 +09:00
Balazs Gerofi
12e7ed644f fileobj_flush_page(): do not offload for files with MF_HOST_RELEASED flag set 2017-06-05 22:20:25 +09:00
Tomoki Shirasawa
edf059888d support rusage parameter of wait4
refs #857
2017-05-28 07:52:47 +09:00
shirasawa
a66fb96cd9 re-autoconf 2017-05-28 07:52:38 +09:00
Balazs Gerofi
dd2ef89997 SMP: generic function call facility for CPU sets 2017-05-28 07:41:48 +09:00
Balazs Gerofi
ba7edf1981 move out local IRQ vector definitions to shared header 2017-05-28 07:36:21 +09:00
Balazs Gerofi
a669fc5125 extend_process_region(): align to heap extension 2017-05-26 15:45:57 +09:00
Balazs Gerofi
c0cabc2d83 brk(): return old address if memory allocation fails 2017-05-26 15:41:38 +09:00
Balazs Gerofi
e306b1e838 fileobj_create(): fix --mpol-shm-premap for Quadrant mode 2017-05-31 08:33:29 +09:00
Balazs Gerofi
0c3b705f98 brk(): make aggressive heap extension optional 2017-05-24 01:41:54 +09:00
Balazs Gerofi
9f55263528 mcexec: atobytes() to convert size string to # of bytes 2017-05-24 01:41:54 +09:00
Balazs Gerofi
74c5f61fd5 mmap(): fix populate_len warning 2017-05-24 01:41:54 +09:00
Balazs Gerofi
cadb66e5c1 init_host_ikc2linux(): adjust minimum queue size 2017-05-23 20:00:09 +09:00
Balazs Gerofi
9b5ccb5a33 Pre-map file mappings from /dev/shm (--mpol-shm-premap mcexec argument) 2017-05-23 20:00:06 +09:00
Balazs Gerofi
c5079898c2 mckernel_allocate_aligned_pages_node(): support explicit NUMA node designation 2017-05-23 19:58:52 +09:00
Balazs Gerofi
746b459e7f profile: more detailed profiling of file PFs 2017-05-23 19:58:52 +09:00
Balazs Gerofi
4c42086154 profile: fix job level clearing 2017-05-23 19:58:52 +09:00
Balazs Gerofi
56ee0787c9 profiler: function to clear process level logs 2017-05-23 19:58:52 +09:00
Balazs Gerofi
e901d42fb6 mcexec: --extend-heap-by: argument to specify heap extension size 2017-05-23 19:58:49 +09:00
Balazs Gerofi
29ab087fa2 execve(): larger allocation for program descriptor 2017-05-23 19:57:08 +09:00
Balazs Gerofi
105d373765 PROFILE_page_fault_XXX: more detailed page PF profiling 2017-05-23 19:57:08 +09:00
Balazs Gerofi
0dd2fad33b brk(): more forceful heap extension 2017-05-23 19:57:08 +09:00
Balazs Gerofi
e554f4e2f9 mcexec: --disable-sched-yield: avoid kernel/user switch 2017-05-23 19:57:08 +09:00
Balazs Gerofi
a256280118 PROFILE_mmap_XXX: more detailed mmap profiling 2017-05-23 19:57:08 +09:00
Balazs Gerofi
d75be7228b PROFILE_mmap_anon_no_contig_phys: profile ANON mmap()s that couldn't be backed by contiguous physical memory 2017-05-23 02:42:06 +09:00
Balazs Gerofi
923dc4aa11 PROFILE_mpol_alloc_missed: profile allocations that fail to satisfy user requested memory policy 2017-05-23 02:42:06 +09:00
Balazs Gerofi
e3e0f6a174 mcexec: introduction of --profile 2017-05-23 02:42:06 +09:00
Balazs Gerofi
dd6f721e03 profile: job level event accumulation 2017-05-23 02:42:06 +09:00
Balazs Gerofi
9c25d47d9b mcexec: transfer job information to LWK 2017-05-23 02:42:06 +09:00
Balazs Gerofi
5a4148aaaf ___kfree(): disregard NULL pointer argument 2017-05-23 02:42:06 +09:00
Balazs Gerofi
32c8f6192d unhandled_page_fault(): print registers for kernel mode PF 2017-05-23 02:42:05 +09:00
Balazs Gerofi
e2f424846c profile: rewrite syscall tracker for generic profiling code 2017-05-23 02:42:05 +09:00
Balazs Gerofi
989af7e045 mcexec: RLIMIT_STACK handling 2017-05-23 02:39:42 +09:00
Balazs Gerofi
721cee05a2 MPOL default threshold to 0 2017-05-23 02:39:42 +09:00
Balazs Gerofi
86aa76e088 IKC: increase ikc2linux channels' queue size 2017-05-23 02:39:42 +09:00
Balazs Gerofi
ab113658f1 mcexec: --no-bind-ikc-map for optionally disabling binding 2017-05-23 02:39:42 +09:00
Balazs Gerofi
2d72042021 mcexec: bind to CPUs according to ikc_map 2017-05-23 02:39:42 +09:00
Balazs Gerofi
610463ff39 sched_setaffinity(): respect process cpu_set 2017-05-23 02:39:42 +09:00
Balazs Gerofi
dfb0a37305 procfs: increase procfs request timeout 2017-05-23 02:39:42 +09:00
Balazs Gerofi
26b9484bae mcexec: --mpol-threshold to control MPOL_BIND/MPOL_PREFERRED 2017-05-23 02:39:42 +09:00
Balazs Gerofi
b4aecfd43c partitioned execution: order by process start time 2017-05-23 02:39:42 +09:00
Balazs Gerofi
bf036f19f7 mcreboot: offline/re-online RAM before IHK reserve 2017-05-23 02:39:42 +09:00
Balazs Gerofi
182202523e mcexec/mm: user memory policy control for heap, stack, etc. 2017-05-23 02:39:42 +09:00
Balazs Gerofi
afb7cb3a1e BSS/data: demand paging for non-file section and respect user requested NUMA allocation policy 2017-05-23 02:39:41 +09:00
Balazs Gerofi
fdbdcbd0ee VR_AP_USER: memory range flag to respect user mempolicy (e.g., in PF handler) 2017-05-23 02:39:41 +09:00
Balazs Gerofi
a18fd1f45c sched_yield(): optionally disable wait 2017-05-23 02:39:41 +09:00
Balazs Gerofi
d8170e292c init_process_stack(): debug msg format 2017-05-23 02:39:41 +09:00
Balazs Gerofi
fee5234c54 stack: force transparent large pages 2017-05-23 02:39:41 +09:00
Balazs Gerofi
6309095fd2 brk(): force transparent large pages 2017-05-23 02:39:41 +09:00
Balazs Gerofi
b005adc103 SCD_MSG_PERF_CTRL: use IKC3 channel for response packet 2017-05-20 12:43:08 +09:00
Balazs Gerofi
21373338cc mcctrl: IHK CPU register manipulation implementation 2017-05-20 12:38:14 +09:00
Balazs Gerofi
39352cd364 event_signal(): use IKC3 ikc2linux channel 2017-05-19 10:31:15 +09:00
Katsukura
84025cc9cb configure : add option --enable-rusage 2017-05-19 10:31:14 +09:00
Yoichi Umezawa
04cbfbb025 xpmem: porting xpmem v2.6.3
implement xpmem_get, xpmem_release, xpmem_attach, xpmem_detach
2017-05-19 10:30:36 +09:00
Katsukura
ba58054c9d create rusage branch. 2017-05-19 10:30:36 +09:00
Ken Sato
7fd55dc83f IKC: only cpu 0 check the master-channel 2017-05-19 10:26:30 +09:00
Ken Sato
d66af42f7b Revert "IKC: separate IRQ between Master-channel and Regular-channel"
This reverts commit 3c98b9410966ceebe187ebae1038317b628fbb03.
2017-05-19 10:26:30 +09:00
Balazs Gerofi
4b964b8e0d IKC: allocate Linux channel table dynamically 2017-05-19 10:26:30 +09:00
Ken Sato
65dc3440cb IKC: separate IRQ between Master-channel and Regular-channel 2017-05-19 10:26:30 +09:00
Ken Sato
fbd9086ce5 IKC: delete receive channel list 2017-05-19 10:26:29 +09:00
Ken Sato
c2b1d8e3ef IKC: delete the comments for review 2017-05-19 10:26:29 +09:00
Balazs Gerofi
e2d59e2cb9 mcreboot-smp: introduction of ikc_irq_start argument 2017-05-19 10:26:29 +09:00
Balazs Gerofi
3de0f5ea19 mcreboot-smp: introduction of ikc_map argument 2017-05-19 10:26:29 +09:00
Balazs Gerofi
373e9ea63c ap_wait(): init syscall channel with proper Linux remote CPU 2017-05-19 10:26:29 +09:00
Ken Sato
8daffa939e IKC: distribute IKC-interrupt to Linux cpus. 2017-05-19 10:26:29 +09:00
Balazs Gerofi
eaa4d35fab do_migrate(): don't clear oversubscribed source CPUs from remote TLB mask 2017-05-17 11:22:29 +09:00
Ken Sato
a968c935b5 Fix timing of save/restore smp_affinity, and modifying of /proc/irq/*/smp_affinity 2017-05-15 14:52:22 +09:00
Balazs Gerofi
e01f6dd6ea eclair: obtain kernel_base from dump_mem_chunks_t 2017-05-12 13:23:23 +09:00
Masamichi Takagi
a07d802cbe Fix manipulation of /proc/irq/*/smp_affinity
Fix the case where
(1) #CPUs % 32 == 0
(2) #CPUs % 4 != 0
2017-05-12 09:35:49 +09:00
Ken Sato
1e442cce10 mcklogd: fixed termination method of mcklogd 2017-05-09 16:28:21 +09:00
Ken Sato
3f870b69a6 mcklogd: change the timing of start/stop. 2017-05-09 16:06:07 +09:00
Balazs Gerofi
0fef80cb19 SCD_MSG_CPU_RW_REG: use syscall channel for reply packet in CPU MSR read/write operation 2017-05-05 00:16:02 +09:00
Balazs Gerofi
9992fe0d72 mcctrl: support remote CPU MSR read/write operations 2017-05-05 00:01:43 +09:00
Balazs Gerofi
2d19ed9391 configure.ac: check NUMA development library 2017-04-29 05:30:27 +09:00
Balazs Gerofi
2f2f04d5a1 mcexec: ENABLE_MCOVERLAYFS on CentOS for up to version 7.3 2017-04-29 05:10:21 +09:00
Ken Sato
1541b26086 ihklib: add pa_info functions. 2017-04-27 17:13:49 +09:00
Ken Sato
e6c4d7731d Merge remote-tracking branch 'origin/rusage'
Conflicts:
	configure
	kernel/process.c
2017-04-27 15:10:38 +09:00
Katsukura
94b527e027 modified: lib/include/ihk/rusage.h 2017-04-27 14:47:21 +09:00
Katsukura
8c9b207557 configure : add option --enable-rusage 2017-04-27 14:00:59 +09:00
Balazs Gerofi
dacb05844b mcoverlayfs: support compile up to 3.10.0-514 2017-04-20 00:48:56 +09:00
Balazs Gerofi
c3ec5d20ca configure: --with-uname_r: optionally specify target kernel version string 2017-04-20 00:48:56 +09:00
Balazs Gerofi
92a40f92dd mcctrl_put_per_proc_data(): do not use task_pid_vnr() in IRQ context 2017-03-30 15:02:57 +09:00
Balazs Gerofi
45bddf3caa mcexec_syscall(): do not use task_pid_vnr() in IRQ context 2017-03-30 14:56:57 +09:00
Balazs Gerofi
b7671fedd3 mcctrl_per_proc_data: comments 2017-03-30 14:51:24 +09:00
Yoichi Umezawa
c38d536aaa xpmem: porting xpmem v2.6.3
implement xpmem_get, xpmem_release, xpmem_attach, xpmem_detach
2017-03-29 18:20:53 +09:00
Yoichi Umezawa
4ee0c05e08 mcoverlayfs: fix NULL pointer dereference on ovl_dentry_release() 2017-03-28 21:52:41 +09:00
Tomoki Shirasawa
f2ab0193e5 fix to panic when thread end and signal overlap. 2017-03-28 11:31:27 +09:00
Tomoki Shirasawa
ef910fdf0e Discard outstanding system calls at the end of mcexec. 2017-03-28 11:23:54 +09:00
Balazs Gerofi
b97a8c5138 mcexec_open_exec(): use strncpy_from_user() before accessing file name 2017-03-21 20:13:12 +09:00
Tomoki Shirasawa
034d10b185 When receiving a signal during futex processing, the signal is not processed. 2017-03-21 20:37:17 +09:00
Katsukura
3fe2257929 create rusage branch. 2017-03-15 23:22:51 +09:00
Tomoki Shirasawa
eca4018ecb mcctrl: release syscall packets when mcexec termination
refs #835
2017-03-11 20:57:54 +09:00
Tomoki Shirasawa
e936b2ebe1 memobj_release: don't call syscall_generic_forwarding after process termination
refs #816
2017-03-10 12:58:47 +09:00
Tomoki Shirasawa
d8112f92f8 terminate(): don't call free_all_process_memory_range
refs #816
2017-03-08 14:30:28 +09:00
Masamichi Takagi
1076010de4 Boundary check in early_alloc_pages() 2017-03-04 17:21:57 +09:00
Balazs Gerofi
da4a5ec44b page_allocator_init(): move memory_nodes to BSS 2017-02-24 19:33:25 +09:00
Balazs Gerofi
d35aa9b100 page_allocator_init(): clean-up code, eliminate initial flag 2017-02-24 14:25:22 +09:00
e29005
ba8dbf1b19 Put kernel image and page table into one chunk 2017-02-24 14:21:32 +09:00
Yoichi Umezawa
6213f0e488 mcctrl: fix cpumask macros for Linux 4.6 2017-02-02 15:49:39 +09:00
Balazs Gerofi
4ef82c2683 OFP-SNC-4: offline/online MCDRAM before memory reservation 2017-01-30 14:47:36 +09:00
Balazs Gerofi
e066a8798c IKC: adjust master channel queue size to nr. of CPUs 2017-01-30 07:24:09 +09:00
Balazs Gerofi
b702c9691e AP init: synchronize syscall channel initialization 2017-01-30 07:24:09 +09:00
Balazs Gerofi
addbe91e59 do_migrate(): signal migrated thread before releasing runq lock 2017-01-30 07:24:09 +09:00
Balazs Gerofi
b812848a0e eclair-dump-backtrace.exp: handle user space threads 2017-01-30 07:24:09 +09:00
Balazs Gerofi
ad214c8206 reserve_user_space(): mutual exclusion on mmap 2017-01-30 07:24:09 +09:00
Balazs Gerofi
1bc3218fc1 partitioned execution: bind mcexec to corresponding NUMA node 2017-01-30 07:24:09 +09:00
Balazs Gerofi
5cc420a6c3 syscall/offload tracker: clean-up and support process-wise aggregation 2017-01-30 07:24:09 +09:00
Balazs Gerofi
c7686fdf4e execve(): fix memory leak 2017-01-30 07:24:09 +09:00
Balazs Gerofi
c1dae4d8b0 mmap(): no physical memory pre-allocation for Intel 128MB mapping 2017-01-30 07:24:08 +09:00
Yoichi Umezawa
2473025201 do_mmap(): remove codes for debug
refs #395
2017-01-16 15:53:27 +09:00
Balazs Gerofi
fa5c1b23ca eclair-dump-backtrace.exp: dump full backtrace of all mckernel threads 2017-01-15 10:46:07 +09:00
Balazs Gerofi
f2f499aace mcreboot/stop: toggle address-space layout randomization (ASLR) to avoid mcexec user-space reservation failure 2017-01-15 10:36:50 +09:00
Balazs Gerofi
bd47b909bf futex(): spin wait when CPU not oversubscribed and fix lost wake-up bug 2017-01-13 08:43:25 +09:00
Balazs Gerofi
d646c2a4b9 cpu_set/clear(): unsigned long for IRQ flags 2017-01-13 08:43:25 +09:00
Balazs Gerofi
865ada46bf IKC2: eliminate unused IKC structures 2017-01-13 08:43:25 +09:00
Balazs Gerofi
cdffc5e853 do_syscall(): eliminate centralized lock for exit/kill code path (use IKC2 thread pool) 2017-01-08 14:16:10 +09:00
Balazs Gerofi
0e67e9266b ap_init(): reformat AP cores report 2017-01-08 14:16:10 +09:00
Balazs Gerofi
1ff0afe6fb devobj/fileobj: do not try to free memory for device file mappings 2017-01-08 14:16:10 +09:00
Balazs Gerofi
d34884f9a4 numa_init(): error handling and propagation 2017-01-08 14:15:51 +09:00
Balazs Gerofi
7a0c204dc1 eclair: report PID for all threads 2017-01-08 14:15:44 +09:00
Balazs Gerofi
25f67c9ef8 mcreboot/mcstop-smp-x86: suppress libkmod warnings 2017-01-08 14:15:34 +09:00
Balazs Gerofi
a776464a7e mcreboot/mcstop: adjust swappiness 2017-01-03 09:02:41 +09:00
Balazs Gerofi
c40e7105e6 NUMA: order nodes by distance for MPOL_BIND / MPOL_PREFERRED policies as well 2017-01-03 09:02:29 +09:00
Balazs Gerofi
5bac38ce8b mmap()/stack/heap: follow user requested NUMA policy 2016-12-31 19:38:05 +09:00
Balazs Gerofi
e3f0662130 allocate_aligned_pages_node(): debug msg format 2016-12-31 16:25:14 +09:00
Balazs Gerofi
21df56b233 sched_wakeup_thread(): memory barrier after status update 2016-12-31 10:44:13 +09:00
Balazs Gerofi
393cec513c allocate_aligned_pages_node(): follow user policy only for user allocations 2016-12-31 10:10:42 +09:00
Balazs Gerofi
4437ecc69a do_mmap(): indicate user level allocations for anonymous mappings 2016-12-31 10:09:49 +09:00
Balazs Gerofi
40d75baca2 ihk_mc_ap_flag: rewrite flag type, intro for denoting user level allocations 2016-12-30 19:19:34 +09:00
Balazs Gerofi
00f3fe0840 ihk_mc_alloc_aligned_pages_node(): support for explicit indication of target NUMA node 2016-12-30 19:03:59 +09:00
Balazs Gerofi
47a8b5bda5 mmap(): faster pre-allocation for anonymous private mappings 2016-12-30 17:18:44 +09:00
Balazs Gerofi
ec75095073 add_process_memory_range(): optionally return range object 2016-12-30 15:51:17 +09:00
Balazs Gerofi
1794232989 irqbalance_mck: create environment file in /tmp to avoid race condition on PFS 2016-12-30 15:47:44 +09:00
Balazs Gerofi
40978d162e procfs_read/write(): rewrite synchronization for scalability and correctness 2016-12-28 14:17:17 +09:00
Balazs Gerofi
536ce9f927 process_procfs_request(): use IRQ save MCS locks while iterating thread list to avoid deadlock 2016-12-28 12:29:10 +09:00
Balazs Gerofi
4e5ec74ffe mmap(): fault in memory only up to file size for populated file mappings 2016-12-27 16:33:24 +09:00
Balazs Gerofi
a6d8125fd7 mcreboot-smp-x86: reserve memory first and then CPUs 2016-12-27 15:19:05 +09:00
Balazs Gerofi
15d3a0361e destroy_ikc_channels(): eliminate kprint from error free path 2016-12-27 11:52:24 +09:00
Balazs Gerofi
6ad84a96a3 mcexec_syscall(): avoid calling task_pid_nr_ns() in IRQ context 2016-12-26 20:43:17 +09:00
Balazs Gerofi
16e846e9b6 mcexec: report error in prepare_image() if wait queue interrupted 2016-12-26 20:42:31 +09:00
Balazs Gerofi
5bc7185f07 do_migrate(): update debug msg format 2016-12-25 17:34:26 +09:00
Balazs Gerofi
32462dfb2d eclair: fix CPU number display for non-active threads 2016-12-25 17:28:31 +09:00
Balazs Gerofi
e3ef88c0cf do_sigsuspend(): deschedule thread when necessary (fixes gdb deadlock) 2016-12-25 17:24:32 +09:00
Balazs Gerofi
829aae7b8d mcexec: PATH_MAX buffer length in do_generic_syscall() 2016-12-25 17:20:14 +09:00
Balazs Gerofi
b836b84825 mcexec_prepare_image(): use memory barrier when updating request status 2016-12-25 17:19:14 +09:00
Balazs Gerofi
3e1f154412 patch_process_vm(): eliminate kprintfs from error free code path 2016-12-25 17:18:20 +09:00
Balazs Gerofi
e7af537452 get_pid_cred(): proper locking around pid_task 2016-12-25 17:17:27 +09:00
Balazs Gerofi
3565959af7 eclair: fix compiler warnings 2016-12-23 09:57:50 +09:00
Balazs Gerofi
4667136a4c mcctrl: refcount per-process data to avoid corrupted syscall request lists 2016-12-23 09:54:15 +09:00
Balazs Gerofi
972d14611a mcctrl: move prepare waitqueue to per-process data 2016-12-22 10:15:31 +09:00
Balazs Gerofi
e90eef8910 eclair: support for direct memory inspection 2016-12-21 21:55:32 +09:00
Balazs Gerofi
f81927b85b Revert "brk(): larger allocation units internally"
This reverts commit c58ab0f648.
2016-12-20 11:11:09 +09:00
Balazs Gerofi
701cdcdab1 use MCS locks in physical memory allocator 2016-12-19 12:57:59 +09:00
Balazs Gerofi
9635a628a9 fileobj/shmobj/devobj: add file size to memobj 2016-12-19 12:55:12 +09:00
Balazs Gerofi
3e1b16f3fc syscall_channel: increase queue size to avoid deadlock in ikc_send() 2016-12-18 21:12:38 +09:00
Balazs Gerofi
ff37ff9ccf memobj: synch prefetch among processes 2016-12-18 21:12:38 +09:00
Balazs Gerofi
5b7bcb7170 fileobj: use read/write MCS locks in page hash 2016-12-18 21:12:37 +09:00
Balazs Gerofi
6a5fe90f98 mcexec_get_cpuset(): save CPU set and IKC target cpu in per-process data 2016-12-18 21:12:37 +09:00
Balazs Gerofi
91373337ba mcctrl: add IKC target CPU to OS file release_handler 2016-12-18 21:12:37 +09:00
Balazs Gerofi
56ed726a88 pager_req_create(): prefetch for MPI library and zerofill for shm 2016-12-18 21:12:37 +09:00
Balazs Gerofi
bce10e11e4 fileobj: rewrite for scalability using per-file page hash 2016-12-18 21:12:37 +09:00
Balazs Gerofi
91cdb16158 MCS lock: separate IRQ disable/enable versions 2016-12-18 21:12:37 +09:00
Balazs Gerofi
c58ab0f648 brk(): larger allocation units internally 2016-12-18 21:12:37 +09:00
Yoichi Umezawa
f410af1cfc xpmem: porting xpmem v2.6.3
implement xpmem_make, xpmem_remove
2016-12-16 17:00:09 +09:00
Balazs Gerofi
aa15e5eea8 mcexec: -t option and OMP_NUM_THREADS for thread pool size 2016-12-14 18:56:30 +09:00
Balazs Gerofi
df9f1f8f78 allocate_aligned_pages(): take user set NUMA policy into account 2016-12-13 17:51:39 +09:00
Balazs Gerofi
7ace35d737 mcexec_get_cpuset(): fix NUMA search bug 2016-12-13 17:50:50 +09:00
Balazs Gerofi
551999ff6b NUMA: order nodes based on distances 2016-12-13 10:46:17 +09:00
Balazs Gerofi
052b3f44ca mcexec: -n: topology aware partitioned execution 2016-12-10 16:27:57 +09:00
Balazs Gerofi
fdcf766337 prepare_process(): pass cpu_set in program_load_desc 2016-12-09 16:32:20 +09:00
Balazs Gerofi
7d13bfb14e set_mempolicy(): limit maxnode to PROCESS_NUMA_MASK_BITS 2016-12-08 21:05:10 +09:00
Ken Sato
202bfd9955 IHK-API: expand and fix for ver 1.2. 2016-12-08 17:28:53 +09:00
Balazs Gerofi
c99e36235b execve(): disable debug warnings 2016-12-08 16:33:24 +09:00
Balazs Gerofi
3cecafac59 obtain_clone_cpuid(): respect parent's CPU set 2016-12-08 16:01:30 +09:00
Balazs Gerofi
61fc4c5e55 show_context_stack(): fix warning 2016-12-07 11:42:09 +09:00
Balazs Gerofi
fad73cacc1 x86: display call stack for IRQ 133 (for debug) 2016-12-07 11:32:02 +09:00
Balazs Gerofi
8fced29978 page_fault_handler(): improved debug msg format 2016-12-07 11:25:02 +09:00
Balazs Gerofi
b0f4ae4890 ihk_mc_pt_set_pte(): double check phys address alignment 2016-12-07 11:23:45 +09:00
Balazs Gerofi
7070094a31 ihk_mc_pt_print_pte(): handle large pages correctly 2016-12-07 11:13:53 +09:00
Balazs Gerofi
011185e3f7 __ihk_pagealloc_large(): fix 1GB page alignment bug 2016-12-07 09:38:37 +09:00
Balazs Gerofi
461881e46a /proc/mckernel to indicate McKernel 2016-12-06 14:29:25 +09:00
Balazs Gerofi
ddc33821cf sched_yield(): avoid schedule for single thread 2016-12-05 18:10:20 +09:00
Balazs Gerofi
0ab7d02994 disable syscall tracker and eliminate interrupt_syscall debug msg 2016-12-05 18:10:20 +09:00
Balazs Gerofi
a8c4ab221b use MCS locks in signal handling code 2016-12-05 18:10:20 +09:00
Balazs Gerofi
87d36a7752 mcreboot-smp-x86: -t to enable turbo boost 2016-12-05 18:10:20 +09:00
Balazs Gerofi
998ded414c mcreboot-smp-x86: shorter sleep in waiting for /proc 2016-12-05 18:10:20 +09:00
Balazs Gerofi
f78d031e64 syscall and offload tracking (disabled by default) 2016-12-05 18:10:20 +09:00
e29001
4ab37dd34a schedule(): only load page table during context switch if it's different 2016-12-05 18:10:20 +09:00
Masamichi Takagi
8129dec2f7 Fix out-of-tree build
<build>/ihk/cokernel/Makefile.common is not found when
<build>/mckernel/kernel/Makfile tries to perform
"Make -C <build>/ihk/{cokernel,ikc}" from mckernel/kernel
2016-12-01 16:44:01 +09:00
Tomoki Shirasawa
a1035a1878 fix out of tree build 2016-12-01 12:55:34 +09:00
Yoichi Umezawa
db169c5f90 add gcc options (-ffreestanding -fno-tree-loop-distribute-patterns)
refs #299
2016-11-29 16:28:18 +09:00
Tomoki Shirasawa
bbb55ef261 sched_setparam: thread lock is necessary when update other thread data 2016-11-28 14:04:44 +09:00
Ken Sato
1130cafe41 ptrace: fixed for threads. 2016-11-28 11:19:30 +09:00
Balazs Gerofi
a1cf27e232 sched_getaffinity(): fix error code for special invalid input 2016-11-28 05:50:01 +09:00
Balazs Gerofi
5a1ce99d87 mcexec: fix number of threads not to exceed thread_data array 2016-11-27 07:31:52 +09:00
Balazs Gerofi
c7db296e1b getcpu(): expose correct NUMA id 2016-11-26 09:29:09 +09:00
Balazs Gerofi
f634a750c5 sched_{set/get}affinity(): fix error codes (also fixes KMP_AFFINITY behavior) 2016-11-24 21:25:16 +09:00
Balazs Gerofi
d07a196c8e mcexec: enable the same number of threads as CPU cores 2016-11-24 16:40:52 +09:00
Balazs Gerofi
8c56c75d2c process_vm_read_writev(): fix base address check for EFAULT 2016-11-24 10:40:41 +09:00
Balazs Gerofi
e54895efde set_mempolicy(): debug msg 2016-11-23 08:53:26 +09:00
Balazs Gerofi
2f8cca2d6d memcpy(): faster version using ASM rep; movsl 2016-11-23 08:51:22 +09:00
Balazs Gerofi
64607152ee VM: introduction of range lookup cache 2016-11-23 08:48:44 +09:00
Balazs Gerofi
20383ad3d0 do_process_vm_read_writev(): page size awareness optimization 2016-11-23 08:47:32 +09:00
Balazs Gerofi
787d34f650 introduction of ihk_mc_pt_virt_to_phys_size() 2016-11-23 08:40:33 +09:00
Balazs Gerofi
ae618a0c68 mcexec: remount /proc in mcexec's file NS after exec() 2016-11-22 13:22:59 +09:00
Yoichi Umezawa
f480376153 mcoverlayfs: supported Linux kernel 4.6
add mcoverlayfs(linux-4.6.7 base)
2016-11-17 18:09:27 +09:00
Balazs Gerofi
e4b3a88fc6 mcexec_sys_umount(): remove debug print 2016-11-10 15:05:45 +09:00
Balazs Gerofi
69a5c53074 NUMA: hide non-existing nodes from /sys/devices/system/node listing 2016-11-05 16:12:08 +09:00
Balazs Gerofi
259583e936 mcreboot-smp-x86.sh: more white out of invalid NUMA info 2016-11-05 13:35:53 +09:00
Balazs Gerofi
0f826290d0 NUMA: get_mempolicy(), set_mempolicy() and mbind() implementation 2016-11-05 13:32:02 +09:00
Balazs Gerofi
e46f027894 mcexec/mcctrl: unmount cgroups (privately) which expose invalid NUMA info 2016-11-04 17:02:48 +09:00
Balazs Gerofi
3e093f6a40 sysfs: fix /sys/devices/system/node/online value 2016-11-03 16:10:29 +09:00
Balazs Gerofi
00996b551f mcreboot: white out non-existing NUMA information 2016-11-03 16:09:27 +09:00
Balazs Gerofi
24d8697cef mcexec: workaround for overlayed /sys FS directory lseek() bug
lseek() on directories under /sys filesystem that are part of an
overlayed filesystem behaves differently than in the original /sys.
This causes segfault in libnuma when discovering topology
information. The patch fakes return value as it is supposed to be,
which also fixes the Intel MPI 2017 MPI_Init() crash.
2016-11-03 13:41:25 +09:00
Balazs Gerofi
be4f6741f9 sysfs: fix /sys/devices/system/cpu/cpuXX/online value 2016-11-03 13:39:21 +09:00
Balazs Gerofi
7a2f67f5f0 sysfs: eliminate unnecessary new line from /sys/devices/system/node/nodeX/distance 2016-11-03 13:37:53 +09:00
Balazs Gerofi
bba0425267 sysfs: fix /sys/devices/system/cpu/online value 2016-11-03 13:36:29 +09:00
Balazs Gerofi
beaf96b375 mcreboot/mcstop: proper error handling (revert previous state) 2016-10-28 14:29:10 +09:00
Balazs Gerofi
f1af1ffb8f NUMA: expose correct NUMA distances in sysfs 2016-10-27 14:29:15 +09:00
Balazs Gerofi
059fab2cc0 mcctrl: fix NULL pointer dereference for unbooted OS instance shutdown 2016-10-26 14:50:07 +09:00
Masamichi Takagi
f284a80656 Defrag memory in mcreboot.sh
Merge free physical pages to create large, physically contiguous
blocks with the following command.

    echo 1 > /proc/sys/vm/compact_memory
2016-10-25 16:35:43 +09:00
Balazs Gerofi
5f973ab51e IKC2: adjust master channel message queue size dynamically
Determine master channel's message queue size based on the number of
LWK CPUs so that all cores can communicate simultaneously during
syscall channel initialization.
2016-10-24 20:49:00 +09:00
Balazs Gerofi
60b6713957 IKC2: eliminate unused structures/fields of old IKC code 2016-10-24 15:41:27 +09:00
Balazs Gerofi
ebcf9a0d6d mcctrl: fix a bunch of -Wframe-larger-than warnings 2016-10-21 04:54:38 -04:00
Balazs Gerofi
942b7f8b78 mcreboot-smp-x86: eliminate unnecessary resource queries 2016-10-21 03:38:21 -04:00
e29005
0b0aa6c0e0 Start mcklogd before McKernel to avoid deadlock
McKernel blocks forever waiting for mcklogd to retrieve kmsg when
the kmsg buffer is full of boot log and mcklogd isn't running.
2016-10-19 16:40:32 +09:00
Balazs Gerofi
9705a80c82 get/set_mempolicy(): support for query/set process level policy 2016-10-16 14:01:14 +09:00
Balazs Gerofi
99a02e2941 get_mempolicy(): store policy in per-process VM structure 2016-10-16 09:10:36 +09:00
Balazs Gerofi
b88d75720f __NR_gettid: use regular offloading channel (fixes unknown PID bug) 2016-10-15 11:46:01 +09:00
Balazs Gerofi
d2b677b6da get_mempolicy(): initial implementation 2016-10-14 21:34:32 +09:00
Balazs Gerofi
083645f203 mcreboot: purge Linux caches before reserving IHK resources 2016-10-14 21:34:32 +09:00
Balazs Gerofi
994b9a19ac NUMA: expose CPU and memory info in /proc/self/status 2016-10-14 21:34:32 +09:00
Balazs Gerofi
faa929e717 NUMA: add NUMA mask to process VM structure 2016-10-14 21:34:31 +09:00
Balazs Gerofi
3ee3a9df6d sysfs: fix bitmask and bitmask list-view display bug 2016-10-14 21:34:31 +09:00
Balazs Gerofi
73e1a4f1f9 NUMA: fill in /sys/devices/system/cpu/nodeX properly and sync with boot script 2016-10-14 21:34:31 +09:00
Balazs Gerofi
b068fde9cd NUMA: use IHK CPU and NUMA mappings for sysfs entries 2016-10-14 21:34:31 +09:00
Balazs Gerofi
167ea67dee NUMA: receive CPU info in array format 2016-10-14 21:34:31 +09:00
Balazs Gerofi
f33d85a27a eclair: support for multiple physical memory chunks 2016-10-14 21:34:31 +09:00
Balazs Gerofi
1e8239d72a kmalloc/pagealloc tracker: fix race condition bug 2016-10-14 21:34:31 +09:00
Balazs Gerofi
a51a0a6f13 page allocation tracker: support tracking partial deallocations 2016-10-14 21:34:31 +09:00
Balazs Gerofi
cc3f6e1a4f page_fault_process_memory_range(): fix double allocation leak 2016-10-14 21:34:31 +09:00
Balazs Gerofi
5db6c311f4 page alloc tracker: count freed pages in addr tracker objects 2016-10-14 21:34:31 +09:00
Balazs Gerofi
f4df713846 munmap(): fix memory leak in non page backed mappings 2016-10-14 21:34:31 +09:00
Balazs Gerofi
7176bb2a47 allow partial deallocation in page level allocation tracker 2016-10-14 21:34:30 +09:00
Balazs Gerofi
a6bd98cc02 MM: memory leak tracker for page level allocator 2016-10-14 21:34:30 +09:00
Balazs Gerofi
0f7462ae1c mm.h: eliminate global pa_allocator 2016-10-14 21:34:30 +09:00
Balazs Gerofi
0d8d915d82 fix KMALLOC_MIN_SIZE macro 2016-10-14 21:34:30 +09:00
Balazs Gerofi
8f4f68b877 eliminate arch_alloc_page() and move ihk_mc_alloc_pages() to arch independent code 2016-10-14 21:34:30 +09:00
Balazs Gerofi
8c0a5a5e61 page_hash_count_pages(): report page hash size in memory stat 2016-10-14 21:34:30 +09:00
Balazs Gerofi
ffd3f53785 page_unmap(): proper locking of hash table 2016-10-14 21:34:30 +09:00
Balazs Gerofi
f39fa54c39 NUMA: default policy: allocate from CPU's NUMA node 2016-10-14 21:34:30 +09:00
Balazs Gerofi
11125b0d68 fileobj and shmemobj: delete unused variables 2016-10-14 21:34:30 +09:00
Balazs Gerofi
3ae69d1290 NUMA: process CPU NUMA information 2016-10-14 21:34:30 +09:00
Balazs Gerofi
2929fbb803 NUMA: support multiple physical allocators 2016-10-14 21:34:30 +09:00
Balazs Gerofi
f4db8b96de fileobj/shmobj: release pages correctly according to dynamic page frame management 2016-10-14 21:34:30 +09:00
Balazs Gerofi
8eb3bf3559 physical page management: eliminate static page frame array and
maintain page structures dynamically covering only file mappings.
use hash table for address <-> page structure conversion.
2016-10-14 21:34:29 +09:00
Balazs Gerofi
326a4fcee4 mem_init(): parse NUMA information 2016-10-14 21:34:29 +09:00
Balazs Gerofi
9b82f1a52c use ihk_mc_alloc/free_pages() and eliminate direct calls to low level routines 2016-10-14 21:34:29 +09:00
Ken Sato
f3da381752 ihk_mc_unmap_virtual: add flush_tlb_single
refs #778
2016-10-11 14:44:23 +09:00
Tomoki Shirasawa
8aa589a40c A signal may not sometimes arrive to a thread. 2016-10-04 14:35:25 +09:00
Tomoki Shirasawa
e03f377326 interrupt_syscall: interrupt valid thread 2016-10-03 00:49:56 +09:00
105 changed files with 23486 additions and 3439 deletions

View File

@@ -49,6 +49,7 @@ install::
mkdir -p -m 755 $(SBINDIR); \
install -m 755 arch/x86/tools/mcreboot-smp-x86.sh $(SBINDIR)/mcreboot.sh; \
install -m 755 arch/x86/tools/mcstop+release-smp-x86.sh $(SBINDIR)/mcstop+release.sh; \
install -m 755 arch/x86/tools/eclair-dump-backtrace.exp $(SBINDIR)/eclair-dump-backtrace.exp;\
mkdir -p -m 755 $(ETCDIR); \
install -m 644 arch/x86/tools/irqbalance_mck.service $(ETCDIR)/irqbalance_mck.service; \
install -m 644 arch/x86/tools/irqbalance_mck.in $(ETCDIR)/irqbalance_mck.in; \

View File

@@ -30,6 +30,7 @@
#include <cls.h>
#include <prctl.h>
#include <page.h>
#include <kmalloc.h>
#define LAPIC_ID 0x020
#define LAPIC_TIMER 0x320
@@ -42,8 +43,6 @@
#define LAPIC_ICR0 0x300
#define LAPIC_ICR2 0x310
#define LAPIC_ESR 0x280
#define LOCAL_TIMER_VECTOR 0xef
#define LOCAL_PERF_VECTOR 0xf0
#define APIC_INT_LEVELTRIG 0x08000
#define APIC_INT_ASSERT 0x04000
@@ -80,6 +79,7 @@ static void (*lapic_icr_write)(unsigned int h, unsigned int l);
static void (*lapic_wait_icr_idle)(void);
void (*x86_issue_ipi)(unsigned int apicid, unsigned int low);
int running_on_kvm(void);
static void smp_func_call_handler(void);
void init_processors_local(int max_id);
void assign_processor_id(void);
@@ -148,7 +148,7 @@ extern char page_fault[], general_protection_exception[];
extern char debug_exception[], int3_exception[];
uint64_t boot_pat_state = 0;
int no_turbo = 0; /* May be updated by early parsing of kargs */
int no_turbo = 1; /* May be updated by early parsing of kargs */
extern int num_processors; /* kernel/ap.c */
struct pvclock_vsyscall_time_info *pvti = NULL;
@@ -844,6 +844,25 @@ void set_signal(int sig, void *regs, struct siginfo *info);
void check_signal(unsigned long, void *, int);
extern void tlb_flush_handler(int vector);
/*
 * Follow a chain of saved frame pointers beginning at sp and print one line
 * per frame. Every frame is read as two words: sp[0] is used as the next
 * frame pointer and sp[1] is printed as the instruction pointer.
 * The walk ends as soon as the current pointer is outside
 * [0xffff800000000000, 0xffffffff80000000).
 */
void __show_stack(uintptr_t *sp) {
	for (;;) {
		uintptr_t cur = (uintptr_t)sp;
		uintptr_t next_fp;
		uintptr_t ret_ip;

		if (cur < 0xffff800000000000 || cur >= 0xffffffff80000000) {
			break;
		}

		next_fp = sp[0];
		ret_ip = sp[1];

		kprintf("IP: %016lx, SP: %016lx, FP: %016lx\n", ret_ip, cur, next_fp);

		/* Continue with the frame the saved pointer refers to. */
		sp = (void *)next_fp;
	}
}
/*
 * Print the frame-pointer chain that starts at rbp.
 * Thin wrapper that forwards to __show_stack().
 */
void show_context_stack(uintptr_t *rbp)
{
	__show_stack(rbp);
}
void handle_interrupt(int vector, struct x86_user_context *regs)
{
struct ihk_mc_interrupt_handler *h;
@@ -952,6 +971,12 @@ void handle_interrupt(int vector, struct x86_user_context *regs)
tlb_flush_handler(vector);
}
else if (vector == LOCAL_SMP_FUNC_CALL_VECTOR) {
smp_func_call_handler();
}
else if (vector == 133) {
show_context_stack((uintptr_t *)regs->gpr.rbp);
}
else {
list_for_each_entry(h, &handlers[vector - 32], list) {
if (h->func) {
@@ -1082,6 +1107,9 @@ unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
/* TODO */
ihk_mc_debug_show_interrupt_context(regs);
if (!(error & PF_USER)) {
panic("panic: kernel mode PF");
}
//dkprintf("now dump a core file\n");
//coredump(proc, regs);
@@ -1736,7 +1764,7 @@ int arch_setup_pvclock(void)
npages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
pvti_npages = npages;
pvti = allocate_pages(npages, IHK_MC_AP_NOWAIT);
pvti = ihk_mc_alloc_pages(npages, IHK_MC_AP_NOWAIT);
if (!pvti) {
ekprintf("arch_setup_pvclock: allocate_pages failed.\n");
return -ENOMEM;
@@ -1766,44 +1794,6 @@ void arch_start_pvclock(void)
return;
} /* arch_start_pvclock() */
static struct cpu_mapping *cpu_mapping = NULL;
int arch_get_cpu_mapping(struct cpu_mapping **buf, int *nelemsp)
{
int error;
size_t size;
int npages;
struct cpu_mapping *mapping;
int cpu;
struct x86_cpu_local_variables *v;
if (!cpu_mapping) {
size = sizeof(*mapping) * num_processors;
npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
mapping = allocate_pages(npages, IHK_MC_AP_NOWAIT);
if (!mapping) {
error = -ENOMEM;
ekprintf("arch_get_cpu_mapping:allocate_pages failed. %d\n", error);
goto out;
}
for (cpu = 0; cpu < num_processors; ++cpu) {
v = get_x86_cpu_local_variable(cpu);
mapping[cpu].cpu_number = cpu;
mapping[cpu].hw_id = v->apic_id;
}
cpu_mapping = mapping;
}
error = 0;
*buf = cpu_mapping;
*nelemsp = num_processors;
out:
return error;
} /* arch_get_cpu_mapping() */
#define KVM_CPUID_SIGNATURE 0x40000000
int running_on_kvm(void) {
@@ -1825,4 +1815,178 @@ int running_on_kvm(void) {
return 0;
}
/*
 * Rewrite a saved NMI context in place so that five new trailing words
 * describe a return into func.
 *
 * nmi_ctx is treated as an array of unsigned long. Words 5..26 are moved
 * down to slots 0..21, then slots 22..26 are filled with: func, 0x20,
 * the current RFLAGS with RFLAGS_IF cleared, the address of slot 27, 0x28.
 *
 * NOTE(review): the five trailing words look like an iretq frame
 * (RIP, CS, RFLAGS, RSP, SS) and the 22/5/27 constants depend on the layout
 * pushed by the NMI entry code -- confirm against that code.
 */
void
mod_nmi_ctx(void *nmi_ctx, void (*func)())
{
unsigned long *l = nmi_ctx;
int i;
unsigned long flags;
/* Read the current RFLAGS value through the stack. */
asm volatile("pushf; pop %0" : "=r"(flags) : : "memory", "cc");
/* Slide 22 saved words down by five slots; i is 22 when the loop ends. */
for (i = 0; i < 22; i++)
l[i] = l[i + 5];
l[i++] = (unsigned long)func; // return address
l[i++] = 0x20; // KERNEL CS
l[i++] = flags & ~RFLAGS_IF; // rflags (disable interrupt)
l[i++] = (unsigned long)(l + 27); // old rsp
l[i++] = 0x28; // KERNEL DS
}
/*
 * Read or write the register described by desc on the calling CPU.
 *
 * desc: desc->addr selects the register (passed to rdmsr()/wrmsr());
 *       desc->val receives the value on a read and supplies it on a write
 * op:   MCCTRL_OS_CPU_READ_REGISTER or MCCTRL_OS_CPU_WRITE_REGISTER
 *
 * Returns 0 on success, -1 for any other op.
 */
int arch_cpu_read_write_register(
		struct ihk_os_cpu_register *desc,
		enum mcctrl_os_cpu_operation op)
{
	switch (op) {
	case MCCTRL_OS_CPU_READ_REGISTER:
		desc->val = rdmsr(desc->addr);
		return 0;

	case MCCTRL_OS_CPU_WRITE_REGISTER:
		wrmsr(desc->addr, desc->val);
		return 0;

	default:
		/* Unknown operation. */
		return -1;
	}
}
/*
 * Generic remote CPU function invocation facility.
 */

/*
 * Handler run on a CPU that received LOCAL_SMP_FUNC_CALL_VECTOR.
 *
 * Drains this CPU's smp_func_req_list: each request is unlinked while holding
 * smp_func_req_lock, then executed with the lock released. After the function
 * returns, its result is stored in req->ret and the requester's cpus_left
 * counter is decremented.
 */
static void smp_func_call_handler(void)
{
	/* Spinlock flags are unsigned long throughout the spinlock API. */
	unsigned long irq_flags;
	struct smp_func_call_request *req;
	int reqs_left;

	do {
		req = NULL;
		reqs_left = 0;

		irq_flags = ihk_mc_spinlock_lock(
				&cpu_local_var(smp_func_req_lock));

		/* Take requests one-by-one */
		if (!list_empty(&cpu_local_var(smp_func_req_list))) {
			req = list_first_entry(&cpu_local_var(smp_func_req_list),
					struct smp_func_call_request, list);
			list_del(&req->list);

			reqs_left = !list_empty(&cpu_local_var(smp_func_req_list));
		}

		ihk_mc_spinlock_unlock(&cpu_local_var(smp_func_req_lock),
				irq_flags);

		if (req) {
			req->ret = req->sfcd->func(req->cpu_index,
					req->sfcd->nr_cpus, req->sfcd->arg);

			/*
			 * Must be the last access to req: smp_call_func() spins on
			 * cpus_left and frees the request array once it reaches zero.
			 */
			ihk_atomic_dec(&req->sfcd->cpus_left);
		}
	} while (reqs_left);
}
/*
 * Run __func on every CPU in *__cpu_set and wait until all of them finish.
 *
 * __cpu_set: CPUs to run on; copied on entry so later changes by the caller
 *            do not affect this call
 * __func:    called as __func(cpu_index, nr_cpus, __arg), where cpu_index is
 *            the position of the CPU within the set (0 .. nr_cpus - 1)
 * __arg:     opaque argument forwarded to __func
 *
 * Remote CPUs get a request queued on their smp_func_req_list followed by a
 * LOCAL_SMP_FUNC_CALL_VECTOR interrupt; if the calling CPU is in the set,
 * __func is invoked directly. The caller then spins until the shared
 * cpus_left counter drops to zero.
 *
 * Returns -EINVAL if __cpu_set or __func is NULL or the set is empty,
 * -ENOMEM if the request array cannot be allocated, otherwise the first
 * non-zero value returned by __func (in cpu_index order), or 0.
 */
int smp_call_func(cpu_set_t *__cpu_set, smp_func_t __func, void *__arg)
{
	int cpu, nr_cpus = 0;
	int cpu_index = 0;
	int this_cpu_index = 0;
	int this_cpu;
	struct smp_func_call_data sfcd;
	struct smp_func_call_request *reqs;
	int ret = 0;
	int call_on_this_cpu = 0;
	cpu_set_t cpu_set;

	/* Sanity checks */
	if (!__cpu_set || !__func) {
		return -EINVAL;
	}

	/* Make sure it won't change in between */
	cpu_set = *__cpu_set;

	/*
	 * Query the CPU id once so that the cpus_left count below and the
	 * enqueue loop agree on which entry is the local one.
	 */
	this_cpu = ihk_mc_get_processor_id();

	for_each_set_bit(cpu, (unsigned long *)&cpu_set,
			sizeof(cpu_set) * BITS_PER_BYTE) {
		if (cpu == this_cpu) {
			call_on_this_cpu = 1;
		}
		++nr_cpus;
	}

	if (!nr_cpus) {
		return -EINVAL;
	}

	reqs = kmalloc(sizeof(*reqs) * nr_cpus, IHK_MC_AP_NOWAIT);
	if (!reqs) {
		/* Nothing was allocated, so there is nothing to free. */
		return -ENOMEM;
	}

	sfcd.nr_cpus = nr_cpus;
	sfcd.func = __func;
	sfcd.arg = __arg;
	/* Only remote CPUs decrement the counter; the local call is synchronous. */
	ihk_atomic_set(&sfcd.cpus_left,
			call_on_this_cpu ? nr_cpus - 1 : nr_cpus);

	/* Add requests and send IPIs */
	cpu_index = 0;
	for_each_set_bit(cpu, (unsigned long *)&cpu_set,
			sizeof(cpu_set) * BITS_PER_BYTE) {
		unsigned long irq_flags;

		reqs[cpu_index].cpu_index = cpu_index;
		reqs[cpu_index].ret = 0;

		if (cpu == this_cpu) {
			this_cpu_index = cpu_index;
			++cpu_index;
			continue;
		}

		reqs[cpu_index].sfcd = &sfcd;

		irq_flags =
			ihk_mc_spinlock_lock(&get_cpu_local_var(cpu)->smp_func_req_lock);
		list_add_tail(&reqs[cpu_index].list,
				&get_cpu_local_var(cpu)->smp_func_req_list);
		ihk_mc_spinlock_unlock(&get_cpu_local_var(cpu)->smp_func_req_lock,
				irq_flags);

		ihk_mc_interrupt_cpu(
				get_x86_cpu_local_variable(cpu)->apic_id,
				LOCAL_SMP_FUNC_CALL_VECTOR);

		++cpu_index;
	}

	/* Is this CPU involved? */
	if (call_on_this_cpu) {
		reqs[this_cpu_index].ret =
			__func(this_cpu_index, nr_cpus, __arg);
	}

	/* Wait for the rest of the CPUs */
	while (ihk_atomic_read(&sfcd.cpus_left) > 0) {
		cpu_pause();
	}

	/* Check return values, if error, report the first non-zero */
	for (cpu_index = 0; cpu_index < nr_cpus; ++cpu_index) {
		if (reqs[cpu_index].ret != 0) {
			ret = reqs[cpu_index].ret;
			break;
		}
	}

	kfree(reqs);
	return ret;
}
/*** end of file ***/

View File

@@ -182,7 +182,6 @@ void fill_prpsinfo(struct note *head, struct thread *thread, void *regs)
/*
We leave most of the fields unfilled.
char pr_state;
char pr_sname;
char pr_zomb;
char pr_nice;

View File

@@ -13,6 +13,8 @@
#ifndef HEADER_X86_COMMON_ARCH_BITOPS_H
#define HEADER_X86_COMMON_ARCH_BITOPS_H
#define ARCH_HAS_FAST_MULTIPLIER 1
static inline int fls(int x)
{
int r;

View File

@@ -131,6 +131,7 @@ static void __ihk_mc_spinlock_unlock(ihk_spinlock_t *lock, unsigned long flags)
/*
 * Node used by the mcs_lock_*() functions in this header. The same type is
 * used both for the lock itself and for the per-acquisition node that the
 * caller passes in. Instances are aligned to 64 bytes.
 */
typedef struct mcs_lock_node {
unsigned long locked; /* set to 0 on the successor node by the unlock path */
struct mcs_lock_node *next; /* successor node; NULL after mcs_lock_init() */
unsigned long irqsave; /* interrupt state saved by mcs_lock_lock(), restored by mcs_lock_unlock() */
} __attribute__((aligned(64))) mcs_lock_node_t;
static void mcs_lock_init(struct mcs_lock_node *node)
@@ -139,7 +140,7 @@ static void mcs_lock_init(struct mcs_lock_node *node)
node->next = NULL;
}
static void mcs_lock_lock(struct mcs_lock_node *lock,
static void __mcs_lock_lock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
struct mcs_lock_node *pred;
@@ -158,7 +159,7 @@ static void mcs_lock_lock(struct mcs_lock_node *lock,
}
}
static void mcs_lock_unlock(struct mcs_lock_node *lock,
static void __mcs_lock_unlock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
if (node->next == NULL) {
@@ -178,6 +179,37 @@ static void mcs_lock_unlock(struct mcs_lock_node *lock,
node->next->locked = 0;
}
/*
 * Acquire the MCS lock without touching the interrupt state.
 * Calls preempt_disable() first and then takes the lock via
 * __mcs_lock_lock(); pair with mcs_lock_unlock_noirq().
 */
static void mcs_lock_lock_noirq(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
preempt_disable();
__mcs_lock_lock(lock, node);
}
/*
 * Release a lock taken with mcs_lock_lock_noirq().
 * The lock is released via __mcs_lock_unlock() before preempt_enable()
 * is called, mirroring the acquire order in reverse.
 */
static void mcs_lock_unlock_noirq(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
__mcs_lock_unlock(lock, node);
preempt_enable();
}
/*
 * Acquire the MCS lock with interrupts disabled.
 * The value returned by cpu_disable_interrupt_save() is kept in
 * node->irqsave so that mcs_lock_unlock() can restore it; the caller must
 * pass the same node to both calls.
 */
static void mcs_lock_lock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
/* Disable interrupts before contending for the lock. */
node->irqsave = cpu_disable_interrupt_save();
mcs_lock_lock_noirq(lock, node);
}
/*
 * Release a lock taken with mcs_lock_lock() and restore the interrupt state
 * that was saved in node->irqsave at acquire time.
 */
static void mcs_lock_unlock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
mcs_lock_unlock_noirq(lock, node);
/* Interrupt state is restored only after the lock has been released. */
cpu_restore_interrupt(node->irqsave);
}
#define SPINLOCK_IN_MCS_RWLOCK
// reader/writer lock
typedef struct mcs_rwlock_node {
ihk_atomic_t count; // num of readers (use only common reader)
@@ -194,21 +226,31 @@ typedef struct mcs_rwlock_node {
} __attribute__((aligned(64))) mcs_rwlock_node_t;
/*
 * Per-acquisition state for the irqsave variants of the reader/writer lock.
 * When SPINLOCK_IN_MCS_RWLOCK is defined the embedded queue node is compiled
 * out and only the saved interrupt state remains.
 */
typedef struct mcs_rwlock_node_irqsave {
#ifndef SPINLOCK_IN_MCS_RWLOCK
struct mcs_rwlock_node node; /* queue node handed to the *_noirq lock functions */
#endif
unsigned long irqsave; /* interrupt state saved at lock time, restored at unlock */
} __attribute__((aligned(64))) mcs_rwlock_node_irqsave_t;
/*
 * Reader/writer lock object.
 * With SPINLOCK_IN_MCS_RWLOCK defined it consists of a single spinlock;
 * otherwise it holds the shared reader node and the base node pointer used
 * by the queue-based implementation.
 */
typedef struct mcs_rwlock_lock {
#ifdef SPINLOCK_IN_MCS_RWLOCK
ihk_spinlock_t slock; /* taken by both the reader and the writer lock paths */
#else
struct mcs_rwlock_node reader; /* common reader lock */
struct mcs_rwlock_node *node; /* base */
#endif
} __attribute__((aligned(64))) mcs_rwlock_lock_t;
/*
 * Initialize a reader/writer lock.
 * With SPINLOCK_IN_MCS_RWLOCK defined this initializes the embedded
 * spinlock; otherwise it zeroes the common reader count, marks the reader
 * node as MCS_RWLOCK_TYPE_COMMON_READER and clears the base node pointer.
 */
static void
mcs_rwlock_init(struct mcs_rwlock_lock *lock)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
ihk_mc_spinlock_init(&lock->slock);
#else
ihk_atomic_set(&lock->reader.count, 0);
lock->reader.type = MCS_RWLOCK_TYPE_COMMON_READER;
lock->node = NULL;
#endif
}
#ifdef DEBUG_MCS_RWLOCK
@@ -223,6 +265,9 @@ __kprintf("[%d] ret mcs_rwlock_writer_lock_noirq\n", ihk_mc_get_processor_id());
static void
__mcs_rwlock_writer_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
ihk_mc_spinlock_lock_noirq(&lock->slock);
#else
struct mcs_rwlock_node *pred;
preempt_disable();
@@ -240,8 +285,10 @@ __mcs_rwlock_writer_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_n
cpu_pause();
}
}
#endif
}
#ifndef SPINLOCK_IN_MCS_RWLOCK
static void
mcs_rwlock_unlock_readers(struct mcs_rwlock_lock *lock)
{
@@ -298,6 +345,7 @@ mcs_rwlock_unlock_readers(struct mcs_rwlock_lock *lock)
f->locked = MCS_RWLOCK_UNLOCKED;
}
#endif
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_writer_unlock_noirq(l, n) { \
@@ -311,6 +359,9 @@ __kprintf("[%d] ret mcs_rwlock_writer_unlock_noirq\n", ihk_mc_get_processor_id()
static void
__mcs_rwlock_writer_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
ihk_mc_spinlock_unlock_noirq(&lock->slock);
#else
if (node->next == NULL) {
struct mcs_rwlock_node *old = (struct mcs_rwlock_node *)
atomic_cmpxchg8((unsigned long *)&lock->node,
@@ -335,6 +386,7 @@ __mcs_rwlock_writer_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock
out:
preempt_enable();
#endif
}
#ifdef DEBUG_MCS_RWLOCK
@@ -367,6 +419,9 @@ atomic_inc_ifnot0(ihk_atomic_t *v)
static void
__mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
ihk_mc_spinlock_lock_noirq(&lock->slock);
#else
struct mcs_rwlock_node *pred;
preempt_disable();
@@ -415,6 +470,7 @@ __mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_n
}
out:
return;
#endif
}
#ifdef DEBUG_MCS_RWLOCK
@@ -429,6 +485,9 @@ __kprintf("[%d] ret mcs_rwlock_reader_unlock_noirq\n", ihk_mc_get_processor_id()
static void
__mcs_rwlock_reader_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
ihk_mc_spinlock_unlock_noirq(&lock->slock);
#else
if(ihk_atomic_dec_return(&lock->reader.count))
goto out;
@@ -458,6 +517,7 @@ __mcs_rwlock_reader_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock
out:
preempt_enable();
#endif
}
#ifdef DEBUG_MCS_RWLOCK
@@ -472,8 +532,12 @@ __kprintf("[%d] ret mcs_rwlock_writer_lock\n", ihk_mc_get_processor_id()); \
/*
 * Take the lock for writing and save the interrupt state in node->irqsave.
 * With SPINLOCK_IN_MCS_RWLOCK defined the embedded spinlock is taken and its
 * returned flags are stored; otherwise interrupts are disabled explicitly
 * before the queue-based writer lock is acquired.
 */
static void
__mcs_rwlock_writer_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
node->irqsave = ihk_mc_spinlock_lock(&lock->slock);
#else
node->irqsave = cpu_disable_interrupt_save();
__mcs_rwlock_writer_lock_noirq(lock, &node->node);
#endif
}
#ifdef DEBUG_MCS_RWLOCK
@@ -488,8 +552,12 @@ __kprintf("[%d] ret mcs_rwlock_writer_unlock\n", ihk_mc_get_processor_id()); \
/*
 * Release a write lock taken with __mcs_rwlock_writer_lock() and restore the
 * interrupt state stored in node->irqsave. The same node must be passed to
 * both calls.
 */
static void
__mcs_rwlock_writer_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
ihk_mc_spinlock_unlock(&lock->slock, node->irqsave);
#else
__mcs_rwlock_writer_unlock_noirq(lock, &node->node);
/* Restore interrupts only after the lock has been released. */
cpu_restore_interrupt(node->irqsave);
#endif
}
#ifdef DEBUG_MCS_RWLOCK
@@ -504,8 +572,12 @@ __kprintf("[%d] ret mcs_rwlock_reader_lock\n", ihk_mc_get_processor_id()); \
/*
 * Take the lock for reading and save the interrupt state in node->irqsave.
 * Note that with SPINLOCK_IN_MCS_RWLOCK defined this takes the same spinlock
 * as the writer path, so readers exclude each other as well as writers.
 */
static void
__mcs_rwlock_reader_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
node->irqsave = ihk_mc_spinlock_lock(&lock->slock);
#else
node->irqsave = cpu_disable_interrupt_save();
__mcs_rwlock_reader_lock_noirq(lock, &node->node);
#endif
}
#ifdef DEBUG_MCS_RWLOCK
@@ -520,8 +592,12 @@ __kprintf("[%d] ret mcs_rwlock_reader_unlock\n", ihk_mc_get_processor_id()); \
/*
 * Release a read lock taken with __mcs_rwlock_reader_lock() and restore the
 * interrupt state stored in node->irqsave. The same node must be passed to
 * both calls.
 */
static void
__mcs_rwlock_reader_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
ihk_mc_spinlock_unlock(&lock->slock, node->irqsave);
#else
__mcs_rwlock_reader_unlock_noirq(lock, &node->node);
/* Restore interrupts only after the lock has been released. */
cpu_restore_interrupt(node->irqsave);
#endif
}
#endif

View File

@@ -204,6 +204,11 @@ static inline int pte_is_fileoff(pte_t *ptep, size_t pgsize)
}
}
/*
 * Replace the physical-address bits of a page table entry.
 * Bits of *ptep outside PT_PHYSMASK are kept; bits inside it are taken
 * from phys.
 */
static inline void pte_update_phys(pte_t *ptep, unsigned long phys)
{
	pte_t kept_bits = *ptep & ~PT_PHYSMASK;

	*ptep = kept_bits | (phys & PT_PHYSMASK);
}
static inline uintptr_t pte_get_phys(pte_t *ptep)
{
return (*ptep & PT_PHYSMASK);
@@ -306,7 +311,7 @@ struct page_table;
void set_pte(pte_t *ppte, unsigned long phys, enum ihk_mc_pt_attribute attr);
pte_t *get_pte(struct page_table *pt, void *virt, enum ihk_mc_pt_attribute attr);
void *early_alloc_page(void);
void *early_alloc_pages(int nr_pages);
void *get_last_early_heap(void);
void flush_tlb(void);
void flush_tlb_single(unsigned long addr);

View File

@@ -0,0 +1,42 @@
#ifndef _ASM_X86_STRING_H
#define _ASM_X86_STRING_H
#define ARCH_FAST_MEMCPY
/*
 * Copy n bytes from "from" to "to" with x86 string instructions.
 *
 * n / 4 four-byte units are moved by "rep movsl"; the remaining 0-3 bytes
 * are handled by testing bit 1 of n (movsw, two bytes) and bit 0 of n
 * (movsb, one byte). d0, d1 and d2 only exist to tell the compiler that
 * the count, destination and source registers are modified.
 *
 * Returns "to".
 * NOTE(review): like memcpy, this presumably requires non-overlapping
 * buffers -- confirm at the call sites.
 */
static inline void *__inline_memcpy(void *to, const void *from, size_t n)
{
unsigned long d0, d1, d2;
asm volatile("rep ; movsl\n\t"
"testb $2,%b4\n\t"
"je 1f\n\t"
"movsw\n"
"1:\ttestb $1,%b4\n\t"
"je 2f\n\t"
"movsb\n"
"2:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (n / 4), "q" (n), "1" ((long)to), "2" ((long)from)
: "memory");
return to;
}
#define ARCH_FAST_MEMSET
/*
 * Fill count bytes at s using x86 string-store instructions.
 *
 * count / 4 four-byte units are written by "rep stosl"; the remaining 0-3
 * bytes are handled by testing bit 1 of count (stosw, two bytes) and bit 0
 * of count (stosb, one byte).
 *
 * c is stored as-is from the accumulator: its low 32 bits per stosl, low 16
 * bits for stosw and low 8 bits for stosb. To fill with a single byte value
 * the caller therefore has to pass that byte replicated across the low 32
 * bits (e.g. 0x5a5a5a5a).
 *
 * Returns s.
 */
static inline void *__inline_memset(void *s, unsigned long c, size_t count)
{
	/*
	 * Dummy outputs that mark the count and destination registers as
	 * modified. They are tied to 64-bit inputs (count / 4 and the pointer),
	 * so they must be 64 bits wide as well, as in __inline_memcpy().
	 */
	unsigned long d0, d1;

	__asm__ __volatile__("rep ; stosl\n\t"
		"testb $2,%b3\n\t"
		"je 1f\n\t"
		"stosw\n"
		"1:\ttestb $1,%b3\n\t"
		"je 2f\n\t"
		"stosb\n"
		"2:"
		: "=&c" (d0), "=&D" (d1)
		: "a" (c), "q" (count), "0" (count/4), "1" ((long)s)
		: "memory");
	return s;
}
#endif

View File

@@ -215,4 +215,25 @@ static inline unsigned long atomic_cmpxchg4(unsigned int *addr,
return oldval;
}
/*
 * Atomically add i to the long at *v using a lock-prefixed addq.
 *
 * The source operand uses the "er" constraint: addq can only encode a
 * sign-extended 32-bit immediate, so a constant that does not fit is placed
 * in a register instead of being emitted as an invalid immediate.
 */
static inline void ihk_atomic_add_long(long i, long *v) {
	__asm__ __volatile__("lock addq %1,%0"
		: "+m" (*v)
		: "er" (i));
}
/*
 * Atomically add the signed value i to the unsigned long at *v using a
 * lock-prefixed addq (a negative i subtracts, with wrap-around).
 *
 * The source operand uses the "er" constraint: addq can only encode a
 * sign-extended 32-bit immediate, so a constant that does not fit is placed
 * in a register instead of being emitted as an invalid immediate.
 */
static inline void ihk_atomic_add_ulong(long i, unsigned long *v) {
	__asm__ __volatile__("lock addq %1,%0"
		: "+m" (*v)
		: "er" (i));
}
/*
 * Atomically add i to the long at *v and return the resulting value.
 *
 * The lock-prefixed xaddq exchanges its operands while adding, so after the
 * instruction i holds the value *v had before the addition; adding the
 * saved increment (__i) to it yields the new value. The result is returned
 * as unsigned long.
 */
static inline unsigned long ihk_atomic_add_long_return(long i, long *v) {
long __i;
/* Keep the increment: xaddq overwrites i with the old *v. */
__i = i;
asm volatile("lock xaddq %0, %1"
: "+r" (i), "+m" (*v)
: : "memory");
return i + __i;
}
#endif

View File

@@ -15,6 +15,9 @@
#include <ikc/ihk.h>
#define IKC_PORT_IKC2MCKERNEL 501
#define IKC_PORT_IKC2LINUX 503
/* manycore side */
int ihk_mc_ikc_init_first(struct ihk_ikc_channel_desc *,
ihk_ikc_ph_t handler);

View File

@@ -215,6 +215,7 @@ struct x86_sregs {
* bit 4 == 1: fault was an instruction fetch
*
* internal use:
* bit 29 == 1: Make PF map text modified by ptrace_poketext()
* bit 30 == 1: don't use COW page to resolve page fault.
*/
enum x86_pf_error_code {

View File

@@ -22,7 +22,7 @@
SYSCALL_HANDLED(0, read)
SYSCALL_DELEGATED(1, write)
SYSCALL_DELEGATED(2, open)
SYSCALL_HANDLED(2, open)
SYSCALL_HANDLED(3, close)
SYSCALL_DELEGATED(4, stat)
SYSCALL_DELEGATED(5, fstat)
@@ -150,5 +150,11 @@ SYSCALL_HANDLED(602, pmc_start)
SYSCALL_HANDLED(603, pmc_stop)
SYSCALL_HANDLED(604, pmc_reset)
SYSCALL_HANDLED(700, get_cpu_id)
#ifdef PROFILE_ENABLE
SYSCALL_HANDLED(__NR_profile, profile)
#endif // PROFILE_ENABLE
SYSCALL_HANDLED(730, util_migrate_inter_kernel)
SYSCALL_HANDLED(731, util_indicate_clone)
SYSCALL_HANDLED(732, get_system)
/**** End of File ****/

View File

@@ -130,11 +130,40 @@ general_protection_exception:
addq $8, %rsp
iretq
/*
 * __freeze: entry stub that calls the C function freeze() and returns with
 * iretq. The call is bracketed by the PUSH_ALL_REGS / POP_ALL_REGS macros
 * (defined elsewhere in this file; presumably they save and restore the
 * general-purpose registers -- confirm against the macro definitions).
 */
.global __freeze
__freeze:
PUSH_ALL_REGS
callq freeze
POP_ALL_REGS
iretq
.globl nmi
nmi:
#define PANICED 232
#define PANIC_REGS 240
movq %rax,%gs:PANIC_REGS+0x00
movq %rsp,%gs:PANIC_REGS+0x08
movl nmi_mode(%rip),%eax
cmp $1,%rax
je 1f
cmp $2,%rax
jne 3f
1:
cld
movq %gs:PANIC_REGS+0x00,%rax
PUSH_ALL_REGS
subq $40, %rsp
movq %rsp,%gs:PANIC_REGS+0x10
movq %rsp, %rdi
call freeze_thaw
cmpq $0, %rax
jnz 2f
addq $40, %rsp
2:
POP_ALL_REGS
iretq
3:
movq %rbx,%gs:PANIC_REGS+0x08
movq %rcx,%gs:PANIC_REGS+0x10
movq %rdx,%gs:PANIC_REGS+0x18
@@ -210,6 +239,7 @@ enter_user_mode:
movq $0, %rdi
movq %rsp, %rsi
call check_signal
call utilthr_migrate
movq $0, %rdi
call set_cputime
POP_ALL_REGS

View File

@@ -31,11 +31,10 @@
static char *last_page;
extern char _head[], _end[];
static struct ihk_mc_pa_ops *pa_ops;
extern unsigned long x86_kernel_phys_base;
void *early_alloc_page(void)
/* Arch specific early allocation routine */
void *early_alloc_pages(int nr_pages)
{
void *p;
@@ -46,43 +45,20 @@ void *early_alloc_page(void)
last_page = phys_to_virt(virt_to_phys(last_page));
} else if (last_page == (void *)-1) {
panic("Early allocator is already finalized. Do not use it.\n");
}
} else {
if(virt_to_phys(last_page) >= bootstrap_mem_end) {
panic("Early allocator: Out of memory\n");
}
}
p = last_page;
last_page += PAGE_SIZE;
last_page += (nr_pages * PAGE_SIZE);
return p;
}
void *arch_alloc_page(enum ihk_mc_ap_flag flag)
void early_alloc_invalidate(void)
{
if (pa_ops)
return pa_ops->alloc_page(1, PAGE_P2ALIGN, flag);
else
return early_alloc_page();
}
void arch_free_page(void *ptr)
{
if (pa_ops)
pa_ops->free_page(ptr, 1);
}
void *ihk_mc_alloc_aligned_pages(int npages, int p2align, enum ihk_mc_ap_flag flag)
{
if (pa_ops)
return pa_ops->alloc_page(npages, p2align, flag);
else
return NULL;
}
void *ihk_mc_alloc_pages(int npages, enum ihk_mc_ap_flag flag)
{
return ihk_mc_alloc_aligned_pages(npages, PAGE_P2ALIGN, flag);
}
void ihk_mc_free_pages(void *p, int npages)
{
if (pa_ops)
pa_ops->free_page(p, npages);
last_page = (void *)-1;
}
void *ihk_mc_allocate(int size, int flag)
@@ -175,7 +151,7 @@ static unsigned long setup_l3(struct page_table *pt,
pt->entry[i] = 0;
continue;
}
pt_phys = setup_l2(arch_alloc_page(IHK_MC_AP_CRITICAL), phys, start, end);
pt_phys = setup_l2(ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL), phys, start, end);
pt->entry[i] = pt_phys | PFL3_PDIR_ATTR;
}
@@ -199,7 +175,7 @@ static void init_normal_area(struct page_table *pt)
for (phys = (map_start & ~(PTL4_SIZE - 1)); phys < map_end;
phys += PTL4_SIZE) {
pt_phys = setup_l3(arch_alloc_page(IHK_MC_AP_CRITICAL), phys,
pt_phys = setup_l3(ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL), phys,
map_start, map_end);
pt->entry[ident_index++] = pt_phys | PFL4_PDIR_ATTR;
@@ -207,9 +183,9 @@ static void init_normal_area(struct page_table *pt)
}
}
static struct page_table *__alloc_new_pt(enum ihk_mc_ap_flag ap_flag)
static struct page_table *__alloc_new_pt(ihk_mc_ap_flag ap_flag)
{
struct page_table *newpt = arch_alloc_page(ap_flag);
struct page_table *newpt = ihk_mc_alloc_pages(1, ap_flag);
if(newpt)
memset(newpt, 0, sizeof(struct page_table));
@@ -306,7 +282,7 @@ void set_pte(pte_t *ppte, unsigned long phys, enum ihk_mc_pt_attribute attr)
* and returns a pointer to the PTE corresponding to the
* virtual address.
*/
pte_t *get_pte(struct page_table *pt, void *virt, enum ihk_mc_pt_attribute attr, enum ihk_mc_ap_flag ap_flag)
pte_t *get_pte(struct page_table *pt, void *virt, enum ihk_mc_pt_attribute attr, ihk_mc_ap_flag ap_flag)
{
int l4idx, l3idx, l2idx, l1idx;
unsigned long v = (unsigned long)virt;
@@ -367,7 +343,7 @@ static int __set_pt_page(struct page_table *pt, void *virt, unsigned long phys,
int l4idx, l3idx, l2idx, l1idx;
unsigned long v = (unsigned long)virt;
struct page_table *newpt;
enum ihk_mc_ap_flag ap_flag;
ihk_mc_ap_flag ap_flag;
int in_kernel =
(((unsigned long long)virt) >= 0xffff000000000000ULL);
unsigned long init_pt_lock_flags;
@@ -518,8 +494,10 @@ uint64_t ihk_mc_pt_virt_to_pagemap(struct page_table *pt, unsigned long virt)
return pagemap;
}
int ihk_mc_pt_virt_to_phys(struct page_table *pt,
const void *virt, unsigned long *phys)
int ihk_mc_pt_virt_to_phys_size(struct page_table *pt,
const void *virt,
unsigned long *phys,
unsigned long *size)
{
int l4idx, l3idx, l2idx, l1idx;
unsigned long v = (unsigned long)virt;
@@ -541,6 +519,7 @@ int ihk_mc_pt_virt_to_phys(struct page_table *pt,
if ((pt->entry[l3idx] & PFL3_SIZE)) {
*phys = pte_get_phys(&pt->entry[l3idx])
| (v & (PTL3_SIZE - 1));
if (size) *size = PTL3_SIZE;
return 0;
}
pt = phys_to_virt(pte_get_phys(&pt->entry[l3idx]));
@@ -551,6 +530,7 @@ int ihk_mc_pt_virt_to_phys(struct page_table *pt,
if ((pt->entry[l2idx] & PFL2_SIZE)) {
*phys = pte_get_phys(&pt->entry[l2idx])
| (v & (PTL2_SIZE - 1));
if (size) *size = PTL2_SIZE;
return 0;
}
pt = phys_to_virt(pte_get_phys(&pt->entry[l2idx]));
@@ -560,9 +540,17 @@ int ihk_mc_pt_virt_to_phys(struct page_table *pt,
}
*phys = pte_get_phys(&pt->entry[l1idx]) | (v & (PTL1_SIZE - 1));
if (size) *size = PTL1_SIZE;
return 0;
}
/*
 * Translate a virtual address to a physical address using page table pt.
 * Convenience form of ihk_mc_pt_virt_to_phys_size() for callers that do not
 * need the size of the backing page; the return value is passed through
 * unchanged.
 */
int ihk_mc_pt_virt_to_phys(struct page_table *pt,
		const void *virt, unsigned long *phys)
{
	/* NULL tells the callee not to report the page size. */
	unsigned long *no_size = NULL;

	return ihk_mc_pt_virt_to_phys_size(pt, virt, phys, no_size);
}
int ihk_mc_pt_print_pte(struct page_table *pt, void *virt)
{
int l4idx, l3idx, l2idx, l1idx;
@@ -574,28 +562,34 @@ int ihk_mc_pt_print_pte(struct page_table *pt, void *virt)
GET_VIRT_INDICES(v, l4idx, l3idx, l2idx, l1idx);
__kprintf("l4 table: 0x%lX l4idx: %d \n", virt_to_phys(pt), l4idx);
if (!(pt->entry[l4idx] & PFL4_PRESENT)) {
__kprintf("0x%lX l4idx not present! \n", (unsigned long)virt);
__kprintf("l4 entry: 0x%lX\n", pt->entry[l4idx]);
return -EFAULT;
}
__kprintf("l4 entry: 0x%lX\n", pt->entry[l4idx]);
pt = phys_to_virt(pt->entry[l4idx] & PAGE_MASK);
__kprintf("l3 table: 0x%lX l3idx: %d \n", virt_to_phys(pt), l3idx);
if (!(pt->entry[l3idx] & PFL3_PRESENT)) {
__kprintf("0x%lX l3idx not present! \n", (unsigned long)virt);
__kprintf("l3 entry: 0x%lX\n", pt->entry[l3idx]);
return -EFAULT;
}
__kprintf("l3 entry: 0x%lX\n", pt->entry[l3idx]);
if ((pt->entry[l3idx] & PFL3_SIZE)) {
__kprintf("l3 entry is 1G page\n");
return 0;
}
pt = phys_to_virt(pt->entry[l3idx] & PAGE_MASK);
__kprintf("l2 table: 0x%lX l2idx: %d \n", virt_to_phys(pt), l2idx);
if (!(pt->entry[l2idx] & PFL2_PRESENT)) {
__kprintf("0x%lX l2idx not present! \n", (unsigned long)virt);
__kprintf("l2 entry: 0x%lX\n", pt->entry[l2idx]);
return -EFAULT;
}
__kprintf("l2 entry: 0x%lX\n", pt->entry[l2idx]);
if ((pt->entry[l2idx] & PFL2_SIZE)) {
__kprintf("l2 entry is 2M page\n");
return 0;
}
pt = phys_to_virt(pt->entry[l2idx] & PAGE_MASK);
@@ -674,7 +668,7 @@ int ihk_mc_pt_prepare_map(page_table_t p, void *virt, unsigned long size,
return ret;
}
struct page_table *ihk_mc_pt_create(enum ihk_mc_ap_flag ap_flag)
struct page_table *ihk_mc_pt_create(ihk_mc_ap_flag ap_flag)
{
struct page_table *pt = ihk_mc_alloc_pages(1, ap_flag);
@@ -718,7 +712,7 @@ static void destroy_page_table(int level, struct page_table *pt)
}
}
arch_free_page(pt);
ihk_mc_free_pages(pt, 1);
return;
}
@@ -1081,11 +1075,29 @@ int visit_pte_range(page_table_t pt, void *start0, void *end0, int pgshift,
/*
 * Arguments shared by the clear_range_l1..l4 page table walk callbacks.
 */
struct clear_range_args {
int free_physical; /* non-zero: release the physical pages behind cleared entries */
uint8_t padding[4];
struct memobj *memobj; /* object passed to memobj_flush_page() for dirty pages */
struct process_vm *vm; /* address space whose TLB entries are invalidated */
unsigned long *addr; /* batch of addresses waiting for remote TLB invalidation */
int nr_addr; /* number of addresses currently stored in addr[] */
int max_nr_addr; /* capacity of addr[]; reaching it triggers a flush */
};
/*
 * Queue addr for a batched remote TLB invalidation.
 *
 * Addresses are collected in args->addr[]. When the array already holds
 * args->max_nr_addr entries, the whole batch is handed to
 * remote_flush_tlb_array_cpumask() first and addr becomes the first entry
 * of a new batch.
 */
static void remote_flush_tlb_add_addr(struct clear_range_args *args,
		unsigned long addr)
{
	if (args->nr_addr >= args->max_nr_addr) {
		/* Batch is full: flush it and start over. */
		remote_flush_tlb_array_cpumask(args->vm, args->addr, args->nr_addr,
				ihk_mc_get_processor_id());
		args->nr_addr = 0;
	}

	args->addr[args->nr_addr] = addr;
	++args->nr_addr;
}
static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base,
uint64_t start, uint64_t end)
{
@@ -1099,7 +1111,7 @@ static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base,
}
old = xchg(ptep, PTE_NULL);
remote_flush_tlb_cpumask(args->vm, base, ihk_mc_get_processor_id());
remote_flush_tlb_add_addr(args, base);
page = NULL;
if (!pte_is_fileoff(&old, PTL1_SIZE)) {
@@ -1107,13 +1119,14 @@ static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base,
page = phys_to_page(phys);
}
if (page && page_is_in_memobj(page) && (old & PFL1_DIRTY)) {
if (page && page_is_in_memobj(page) && (old & PFL1_DIRTY) && (args->memobj) &&
!(args->memobj->flags & MF_ZEROFILL)) {
memobj_flush_page(args->memobj, phys, PTL1_SIZE);
}
if (!(old & PFL1_FILEOFF) && args->free_physical) {
if (page && page_unmap(page)) {
ihk_mc_free_pages(phys_to_virt(phys), 1);
if (!page || (page && page_unmap(page))) {
ihk_mc_free_pages_user(phys_to_virt(phys), 1);
dkprintf("%s: freeing regular page at 0x%lx\n", __FUNCTION__, base);
}
args->vm->currss -= PTL1_SIZE;
@@ -1147,8 +1160,7 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base,
if (*ptep & PFL2_SIZE) {
old = xchg(ptep, PTE_NULL);
remote_flush_tlb_cpumask(args->vm, base,
ihk_mc_get_processor_id());
remote_flush_tlb_add_addr(args, base);
page = NULL;
if (!pte_is_fileoff(&old, PTL2_SIZE)) {
@@ -1161,8 +1173,9 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base,
}
if (!(old & PFL2_FILEOFF) && args->free_physical) {
if (page && page_unmap(page)) {
ihk_mc_free_pages(phys_to_virt(phys), PTL2_SIZE/PTL1_SIZE);
if (!page || (page && page_unmap(page))) {
ihk_mc_free_pages_user(phys_to_virt(phys),
PTL2_SIZE/PTL1_SIZE);
dkprintf("%s: freeing large page at 0x%lx\n", __FUNCTION__, base);
}
args->vm->currss -= PTL2_SIZE;
@@ -1179,9 +1192,8 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base,
if ((start <= base) && ((base + PTL2_SIZE) <= end)) {
*ptep = PTE_NULL;
remote_flush_tlb_cpumask(args->vm, base,
ihk_mc_get_processor_id());
arch_free_page(pt);
remote_flush_tlb_add_addr(args, base);
ihk_mc_free_pages(pt, 1);
}
return 0;
@@ -1212,8 +1224,7 @@ static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base,
if (*ptep & PFL3_SIZE) {
old = xchg(ptep, PTE_NULL);
remote_flush_tlb_cpumask(args->vm, base,
ihk_mc_get_processor_id());
remote_flush_tlb_add_addr(args, base);
page = NULL;
if (!pte_is_fileoff(&old, PTL3_SIZE)) {
@@ -1226,8 +1237,9 @@ static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base,
}
if (!(old & PFL3_FILEOFF) && args->free_physical) {
if (page && page_unmap(page)) {
ihk_mc_free_pages(phys_to_virt(phys), PTL3_SIZE/PTL1_SIZE);
if (!page || (page && page_unmap(page))) {
ihk_mc_free_pages_user(phys_to_virt(phys),
PTL3_SIZE/PTL1_SIZE);
}
args->vm->currss -= PTL3_SIZE;
}
@@ -1243,9 +1255,8 @@ static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base,
if (use_1gb_page && (start <= base) && ((base + PTL3_SIZE) <= end)) {
*ptep = PTE_NULL;
remote_flush_tlb_cpumask(args->vm, base,
ihk_mc_get_processor_id());
arch_free_page(pt);
remote_flush_tlb_add_addr(args, base);
ihk_mc_free_pages(pt, 1);
}
return 0;
@@ -1264,8 +1275,10 @@ static int clear_range_l4(void *args0, pte_t *ptep, uint64_t base,
return walk_pte_l3(pt, base, start, end, &clear_range_l3, args0);
}
static int clear_range(struct page_table *pt, struct process_vm *vm,
uintptr_t start, uintptr_t end, int free_physical,
#define TLB_INVALID_ARRAY_PAGES (4)
static int clear_range(struct page_table *pt, struct process_vm *vm,
uintptr_t start, uintptr_t end, int free_physical,
struct memobj *memobj)
{
int error;
@@ -1280,11 +1293,35 @@ static int clear_range(struct page_table *pt, struct process_vm *vm,
return -EINVAL;
}
/* TODO: embed this in tlb_flush_entry? */
args.addr = (unsigned long *)ihk_mc_alloc_pages(
TLB_INVALID_ARRAY_PAGES, IHK_MC_AP_CRITICAL);
if (!args.addr) {
ekprintf("%s: error: allocating address array\n", __FUNCTION__);
return -ENOMEM;
}
args.nr_addr = 0;
args.max_nr_addr = (TLB_INVALID_ARRAY_PAGES * PAGE_SIZE /
sizeof(uint64_t));
args.free_physical = free_physical;
if (memobj && (memobj->flags & MF_DEV_FILE)) {
args.free_physical = 0;
}
if (memobj && ((memobj->flags & MF_PREMAP))) {
args.free_physical = 0;
}
args.memobj = memobj;
args.vm = vm;
error = walk_pte_l4(pt, 0, start, end, &clear_range_l4, &args);
if (args.nr_addr) {
remote_flush_tlb_array_cpumask(vm, args.addr, args.nr_addr,
ihk_mc_get_processor_id());
}
ihk_mc_free_pages(args.addr, TLB_INVALID_ARRAY_PAGES);
return error;
}
@@ -1596,7 +1633,7 @@ retry:
error = 0;
out:
if (newpt) {
arch_free_page(newpt);
ihk_mc_free_pages(newpt, 1);
}
dkprintf("set_range_l2(%lx,%lx,%lx): %d %lx\n",
base, start, end, error, *ptep);
@@ -1679,7 +1716,7 @@ retry:
error = 0;
out:
if (newpt) {
arch_free_page(newpt);
ihk_mc_free_pages(newpt, 1);
}
dkprintf("set_range_l3(%lx,%lx,%lx): %d\n",
base, start, end, error, *ptep);
@@ -1737,7 +1774,7 @@ retry:
error = 0;
out:
if (newpt) {
arch_free_page(newpt);
ihk_mc_free_pages(newpt, 1);
}
dkprintf("set_range_l4(%lx,%lx,%lx): %d %lx\n",
base, start, end, error, *ptep);
@@ -1789,9 +1826,19 @@ int ihk_mc_pt_set_pte(page_table_t pt, pte_t *ptep, size_t pgsize,
*ptep = phys | attr_to_l1attr(attr);
}
else if (pgsize == PTL2_SIZE) {
if (phys & (PTL2_SIZE - 1)) {
kprintf("%s: error: phys needs to be PTL2_SIZE aligned\n", __FUNCTION__);
error = -1;
goto out;
}
*ptep = phys | attr_to_l2attr(attr | PTATTR_LARGEPAGE);
}
else if ((pgsize == PTL3_SIZE) && (use_1gb_page)) {
if (phys & (PTL3_SIZE - 1)) {
kprintf("%s: error: phys needs to be PTL3_SIZE aligned\n", __FUNCTION__);
error = -1;
goto out;
}
*ptep = phys | attr_to_l3attr(attr | PTATTR_LARGEPAGE);
}
else {
@@ -2055,7 +2102,8 @@ void *map_fixed_area(unsigned long phys, unsigned long size, int uncachable)
attr |= PTATTR_UNCACHABLE;
}
kprintf("map_fixed: %lx => %p (%d pages)\n", paligned, v, npages);
kprintf("map_fixed: phys: 0x%lx => 0x%lx (%d pages)\n",
paligned, v, npages);
for (i = 0; i < npages; i++) {
if(__set_pt_page(init_pt, (void *)fixed_virt, paligned, attr)){
@@ -2094,7 +2142,7 @@ static void init_vsyscall_area(struct page_table *pt)
void init_page_table(void)
{
check_available_page_size();
init_pt = arch_alloc_page(IHK_MC_AP_CRITICAL);
init_pt = ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL);
ihk_mc_spinlock_init(&init_pt_lock);
memset(init_pt, 0, sizeof(PAGE_SIZE));
@@ -2111,27 +2159,27 @@ void init_page_table(void)
}
extern void __reserve_arch_pages(unsigned long, unsigned long,
void (*)(unsigned long, unsigned long, int));
void (*)(struct ihk_page_allocator_desc *,
unsigned long, unsigned long, int));
void ihk_mc_reserve_arch_pages(unsigned long start, unsigned long end,
void (*cb)(unsigned long, unsigned long, int))
void ihk_mc_reserve_arch_pages(struct ihk_page_allocator_desc *pa_allocator,
unsigned long start, unsigned long end,
void (*cb)(struct ihk_page_allocator_desc *,
unsigned long, unsigned long, int))
{
/* Reserve Text + temporal heap */
cb(virt_to_phys(_head), virt_to_phys(get_last_early_heap()), 0);
cb(pa_allocator, virt_to_phys(_head), virt_to_phys(get_last_early_heap()), 0);
/* Reserve trampoline area to boot the second ap */
cb(ap_trampoline, ap_trampoline + AP_TRAMPOLINE_SIZE, 0);
cb(pa_allocator, ap_trampoline, ap_trampoline + AP_TRAMPOLINE_SIZE, 0);
/* Reserve the null page */
cb(0, PAGE_SIZE, 0);
/* Micro-arch specific */
cb(pa_allocator, 0, PAGE_SIZE, 0);
/*
* Micro-arch specific
* TODO: this does nothing in SMP mode, update it for KNC if necessary
*/
__reserve_arch_pages(start, end, cb);
}
void ihk_mc_set_page_allocator(struct ihk_mc_pa_ops *ops)
{
last_page = (void *)-1;
pa_ops = ops;
}
unsigned long virt_to_phys(void *v)
{
unsigned long va = (unsigned long)v;
@@ -2158,26 +2206,18 @@ int copy_from_user(void *dst, const void *src, size_t siz)
int strlen_user(const char *s)
{
struct process_vm *vm = cpu_local_var(current)->vm;
struct vm_range *range;
unsigned long pgstart;
int maxlen;
const char *head = s;
int err;
maxlen = 4096 - (((unsigned long)s) & 0x0000000000000fffUL);
pgstart = ((unsigned long)s) & 0xfffffffffffff000UL;
if(!pgstart || pgstart >= MAP_KERNEL_START)
return -EFAULT;
ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
for(;;){
range = lookup_process_memory_range(vm, pgstart, pgstart+1);
if(range == NULL){
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
return -EFAULT;
}
if((range->flag & VR_PROT_MASK) == VR_PROT_NONE){
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
return -EFAULT;
}
if ((err = verify_process_vm(vm, s, 1)))
return err;
while(*s && maxlen > 0){
s++;
maxlen--;
@@ -2187,14 +2227,12 @@ int strlen_user(const char *s)
maxlen = 4096;
pgstart += 4096;
}
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
return s - head;
}
int strcpy_from_user(char *dst, const char *src)
{
struct process_vm *vm = cpu_local_var(current)->vm;
struct vm_range *range;
unsigned long pgstart;
int maxlen;
int err = 0;
@@ -2203,17 +2241,9 @@ int strcpy_from_user(char *dst, const char *src)
pgstart = ((unsigned long)src) & 0xfffffffffffff000UL;
if(!pgstart || pgstart >= MAP_KERNEL_START)
return -EFAULT;
ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
for(;;){
range = lookup_process_memory_range(vm, pgstart, pgstart + 1);
if(range == NULL){
err = -EFAULT;
break;
}
if((range->flag & VR_PROT_MASK) == VR_PROT_NONE){
err = -EFAULT;
break;
}
if ((err = verify_process_vm(vm, src, 1)))
return err;
while(*src && maxlen > 0){
*(dst++) = *(src++);
maxlen--;
@@ -2225,34 +2255,62 @@ int strcpy_from_user(char *dst, const char *src)
maxlen = 4096;
pgstart += 4096;
}
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
return err;
}
long getlong_user(const long *p)
long getlong_user(long *dest, const long *p)
{
int error;
long l;
error = copy_from_user(&l, p, sizeof(l));
error = copy_from_user(dest, p, sizeof(long));
if (error) {
return error;
}
return l;
return 0;
}
int getint_user(const int *p)
int getint_user(int *dest, const int *p)
{
int error;
int i;
error = copy_from_user(&i, p, sizeof(i));
error = copy_from_user(dest, p, sizeof(int));
if (error) {
return error;
}
return i;
return 0;
}
/*
 * verify_process_vm() - check a user-space range by faulting in its pages.
 *
 * @vm:   address space the range is checked against
 * @usrc: first byte of the user-space range
 * @size: length of the range in bytes
 *
 * The range must lie inside [vm->region.user_start, vm->region.user_end).
 * Every page overlapped by the range is then passed to
 * page_fault_process_vm() with reason PF_USER.
 *
 * Returns 0 on success, -EFAULT if the range is outside the user region,
 * -EINVAL if the page at address 0 would be probed, or the error returned
 * by page_fault_process_vm() for the first page that fails.
 */
int verify_process_vm(struct process_vm *vm,
		const void *usrc, size_t size)
{
	const uintptr_t ustart = (uintptr_t)usrc;
	/* Cannot wrap once the range check below has passed. */
	const uintptr_t uend = ustart + size;
	uint64_t reason;
	uintptr_t addr;
	int error = 0;

	if ((ustart < vm->region.user_start)
			|| (vm->region.user_end <= ustart)
			|| ((vm->region.user_end - ustart) < size)) {
		kprintf("%s: error: out of user range\n", __FUNCTION__);
		return -EFAULT;
	}

	reason = PF_USER;	/* page not present */
	/* Start from the page containing ustart and step one page at a time. */
	for (addr = ustart & PAGE_MASK; addr < uend; addr += PAGE_SIZE) {
		if (!addr)
			return -EINVAL;

		error = page_fault_process_vm(vm, (void *)addr, reason);
		if (error) {
			/* %p requires a pointer argument, so convert the integer. */
			kprintf("%s: error: PF for %p failed\n", __FUNCTION__,
					(void *)addr);
			return error;
		}
	}

	return error;
}
int read_process_vm(struct process_vm *vm, void *kdst, const void *usrc, size_t siz)
@@ -2383,8 +2441,18 @@ int write_process_vm(struct process_vm *vm, void *udst, const void *ksrc, size_t
return error;
}
va = phys_to_virt(pa);
memcpy(va, from, cpsize);
if (pa < ihk_mc_get_memory_address(IHK_MC_GMA_MAP_START, 0) ||
pa >= ihk_mc_get_memory_address(IHK_MC_GMA_MAP_END, 0)) {
dkprintf("%s: pa is outside of LWK memory, from: %p,"
"pa: %p, cpsize: %d\n", __FUNCTION__, from, pa, cpsize);
va = ihk_mc_map_virtual(pa, 1, PTATTR_ACTIVE);
memcpy(va, from, cpsize);
ihk_mc_unmap_virtual(va, 1, 1);
}
else {
va = phys_to_virt(pa);
memcpy(va, from, cpsize);
}
from += cpsize;
to += cpsize;
@@ -2408,7 +2476,7 @@ int patch_process_vm(struct process_vm *vm, void *udst, const void *ksrc, size_t
unsigned long pa;
void *va;
kprintf("patch_process_vm(%p,%p,%p,%lx)\n", vm, udst, ksrc, siz);
dkprintf("patch_process_vm(%p,%p,%p,%lx)\n", vm, udst, ksrc, siz);
if ((ustart < vm->region.user_start)
|| (vm->region.user_end <= ustart)
|| ((vm->region.user_end - ustart) < siz)) {
@@ -2458,6 +2526,6 @@ int patch_process_vm(struct process_vm *vm, void *udst, const void *ksrc, size_t
remain -= cpsize;
}
kprintf("patch_process_vm(%p,%p,%p,%lx):%d\n", vm, udst, ksrc, siz, 0);
dkprintf("patch_process_vm(%p,%p,%p,%lx):%d\n", vm, udst, ksrc, siz, 0);
return 0;
} /* patch_process_vm() */

View File

@@ -16,6 +16,7 @@
#include <memory.h>
#include <string.h>
extern int num_processors;
extern void arch_set_mikc_queue(void *r, void *w);
ihk_ikc_ph_t arch_master_channel_packet_handler;
@@ -23,17 +24,23 @@ int ihk_mc_ikc_init_first_local(struct ihk_ikc_channel_desc *channel,
ihk_ikc_ph_t packet_handler)
{
struct ihk_ikc_queue_head *rq, *wq;
size_t mikc_queue_pages;
ihk_ikc_system_init(NULL);
memset(channel, 0, sizeof(struct ihk_ikc_channel_desc));
/* Place both sides in this side */
rq = arch_alloc_page(IHK_MC_AP_CRITICAL);
wq = arch_alloc_page(IHK_MC_AP_CRITICAL);
mikc_queue_pages = ((2 * num_processors * MASTER_IKCQ_PKTSIZE)
+ (PAGE_SIZE - 1)) / PAGE_SIZE;
ihk_ikc_init_queue(rq, 0, 0, PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
ihk_ikc_init_queue(wq, 0, 0, PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
/* Place both sides in this side */
rq = ihk_mc_alloc_pages(mikc_queue_pages, IHK_MC_AP_CRITICAL);
wq = ihk_mc_alloc_pages(mikc_queue_pages, IHK_MC_AP_CRITICAL);
ihk_ikc_init_queue(rq, 0, 0,
mikc_queue_pages * PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
ihk_ikc_init_queue(wq, 0, 0,
mikc_queue_pages * PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
arch_master_channel_packet_handler = packet_handler;

View File

@@ -17,8 +17,26 @@
extern unsigned int *x86_march_perfmap;
extern int running_on_kvm(void);
//#define PERFCTR_DEBUG
#ifdef PERFCTR_DEBUG
#define dkprintf(...) do { kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) do { kprintf(__VA_ARGS__); } while (0)
#else
#define dkprintf(...) do { } while (0)
#define ekprintf(...) do { kprintf(__VA_ARGS__); } while (0)
#endif
#define X86_CR4_PCE 0x00000100
#define PERFCTR_CHKANDJUMP(cond, msg, err) \
do { \
if(cond) { \
ekprintf("%s,"msg"\n", __FUNCTION__); \
ret = err; \
goto fn_fail; \
} \
} while(0)
int perf_counters_discovered = 0;
int X86_IA32_NUM_PERF_COUNTERS = 0;
unsigned long X86_IA32_PERF_COUNTERS_MASK = 0;
@@ -203,9 +221,12 @@ extern void x86_march_perfctr_start(unsigned long counter_mask);
int ihk_mc_perfctr_start(unsigned long counter_mask)
{
int ret = 0;
unsigned long value = 0;
unsigned long mask = X86_IA32_PERF_COUNTERS_MASK | X86_IA32_FIXED_PERF_COUNTERS_MASK;
PERFCTR_CHKANDJUMP(counter_mask & ~mask, "counter_mask out of range", -EINVAL);
#ifdef HAVE_MARCH_PERFCTR_START
x86_march_perfctr_start(counter_mask);
#endif
@@ -213,15 +234,20 @@ int ihk_mc_perfctr_start(unsigned long counter_mask)
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value |= counter_mask;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
return 0;
fn_exit:
return ret;
fn_fail:
goto fn_exit;
}
int ihk_mc_perfctr_stop(unsigned long counter_mask)
{
int ret = 0;
unsigned long value;
unsigned long mask = X86_IA32_PERF_COUNTERS_MASK | X86_IA32_FIXED_PERF_COUNTERS_MASK;
PERFCTR_CHKANDJUMP(counter_mask & ~mask, "counter_mask out of range", -EINVAL);
counter_mask &= mask;
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value &= ~counter_mask;
@@ -244,8 +270,10 @@ int ihk_mc_perfctr_stop(unsigned long counter_mask)
value &= ~(0xf << 8);
wrmsr(MSR_PERF_FIXED_CTRL, value);
}
return 0;
fn_exit:
return ret;
fn_fail:
goto fn_exit;
}
// init for fixed counter

View File

@@ -70,71 +70,37 @@ static struct vdso vdso;
static size_t container_size = 0;
static ptrdiff_t vdso_offset;
/*
See dkprintf("BSP HW ID = %d, ", bsp_hw_id); (in ./mcos/kernel/ap.c)
extern int num_processors;
Core with BSP HW ID 224 is 1st logical core of last physical core.
It boots first and is given SW-ID of 0
int obtain_clone_cpuid(cpu_set_t *cpu_set) {
int min_queue_len = -1;
int cpu, min_cpu = -1;
Core with BSP HW ID 0 is 1st logical core of 1st physical core.
It boots next and is given SW-ID of 1.
Core with BSP HW ID 1 boots next and is given SW-ID of 2.
Core with BSP HW ID 2 boots next and is given SW-ID of 3.
Core with BSP HW ID 3 boots next and is given SW-ID of 4.
...
Core with BSP HW ID 220 is 1st logical core of 56-th physical core.
It boots next and is given SW-ID of 221.
Core with BSP HW ID 221 boots next and is given SW-ID of 222.
Core with BSP HW ID 222 boots next and is given SW-ID of 223.
Core with BSP HW ID 223 boots next and is given SW-ID of 224.
/* Find the first allowed core with the shortest run queue */
for (cpu = 0; cpu < num_processors; ++cpu) {
struct cpu_local_var *v;
unsigned long irqstate;
Core with BSP HW ID 225 is 2nd logical core of last physical core.
It boots next and is given SW-ID of 225.
Core with BSP HW ID 226 boots next and is given SW-ID of 226.
Core with BSP HW ID 227 boots next and is given SW-ID of 227.
*/
ihk_spinlock_t cpuid_head_lock = 0;
static int cpuid_head = 0;
if (!CPU_ISSET(cpu, cpu_set)) continue;
/* architecture-dependent syscall handlers */
int obtain_clone_cpuid() {
/* see above on BSP HW ID */
struct ihk_mc_cpu_info *cpu_info = ihk_mc_get_cpu_info();
int cpuid, nretry = 0;
ihk_mc_spinlock_lock_noirq(&cpuid_head_lock);
/* Always start from 0 to fill in LWK cores linearily */
cpuid_head = 0;
retry:
/* Try to obtain next physical core */
cpuid = cpuid_head;
v = get_cpu_local_var(cpu);
irqstate = ihk_mc_spinlock_lock(&v->runq_lock);
if (min_queue_len == -1 || v->runq_len < min_queue_len) {
min_queue_len = v->runq_len;
min_cpu = cpu;
}
ihk_mc_spinlock_unlock(&v->runq_lock, irqstate);
/* A hyper-threading core on the same physical core as
the parent process might be chosen. Use sched_setaffinity
if you want to skip that kind of busy physical core for
performance reason. */
cpuid_head += 1;
if(cpuid_head >= cpu_info->ncpus) {
cpuid_head = 0;
}
if (min_queue_len == 0)
break;
}
/* A hyper-threading core whose parent physical core has a
process on one of its hyper-threading core might
be chosen. Use sched_setaffinity if you want to skip that
kind of busy physical core for performance reason. */
if(get_cpu_local_var(cpuid)->status != CPU_STATUS_IDLE) {
nretry++;
if(nretry >= cpu_info->ncpus) {
cpuid = -1;
ihk_mc_spinlock_unlock_noirq(&cpuid_head_lock);
goto out;
}
goto retry;
}
get_cpu_local_var(cpuid)->status = CPU_STATUS_RESERVED;
ihk_mc_spinlock_unlock_noirq(&cpuid_head_lock);
out:
return cpuid;
if (min_cpu != -1) {
if (get_cpu_local_var(min_cpu)->status != CPU_STATUS_RESERVED)
get_cpu_local_var(min_cpu)->status = CPU_STATUS_RESERVED;
}
return min_cpu;
}
int
@@ -293,7 +259,7 @@ SYSCALL_DECLARE(rt_sigreturn)
extern struct cpu_local_var *clv;
extern unsigned long do_kill(struct thread *thread, int pid, int tid, int sig, struct siginfo *info, int ptracecont);
extern void interrupt_syscall(int pid, int tid);
extern void interrupt_syscall(struct thread *, int sig);
extern int num_processors;
#define RFLAGS_MASK (RFLAGS_CF | RFLAGS_PF | RFLAGS_AF | RFLAGS_ZF | \
@@ -544,14 +510,14 @@ void ptrace_report_signal(struct thread *thread, int sig)
int parent_pid;
struct siginfo info;
dkprintf("ptrace_report_signal,pid=%d\n", thread->proc->pid);
dkprintf("ptrace_report_signal, tid=%d, pid=%d\n", thread->tid, thread->proc->pid);
mcs_rwlock_writer_lock(&proc->update_lock, &lock);
if(!(proc->ptrace & PT_TRACED)){
mcs_rwlock_writer_unlock(&proc->update_lock, &lock);
return;
}
proc->exit_status = sig;
thread->exit_status = sig;
/* Transition thread state */
proc->status = PS_TRACED;
thread->status = PS_TRACED;
@@ -569,8 +535,8 @@ void ptrace_report_signal(struct thread *thread, int sig)
memset(&info, '\0', sizeof info);
info.si_signo = SIGCHLD;
info.si_code = CLD_TRAPPED;
info._sifields._sigchld.si_pid = thread->proc->pid;
info._sifields._sigchld.si_status = thread->proc->exit_status;
info._sifields._sigchld.si_pid = thread->tid;
info._sifields._sigchld.si_status = thread->exit_status;
do_kill(cpu_local_var(current), parent_pid, -1, SIGCHLD, &info, 0);
/* Wake parent (if sleeping in wait4()) */
waitq_wakeup(&proc->parent->waitpid_q);
@@ -695,10 +661,10 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
int orgsig;
int ptraceflag = 0;
struct mcs_rwlock_node_irqsave lock;
unsigned long irqstate;
struct mcs_rwlock_node_irqsave mcs_rw_node;
for(w = pending->sigmask.__val[0], sig = 0; w; sig++, w >>= 1);
dkprintf("do_signal,pid=%d,sig=%d\n", proc->pid, sig);
dkprintf("do_signal(): tid=%d, pid=%d, sig=%d\n", thread->tid, proc->pid, sig);
orgsig = sig;
if((proc->ptrace & PT_TRACED) &&
@@ -718,12 +684,12 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
rc = regs->gpr.rax;
}
irqstate = ihk_mc_spinlock_lock(&thread->sigcommon->lock);
mcs_rwlock_writer_lock(&thread->sigcommon->lock, &mcs_rw_node);
k = thread->sigcommon->action + sig - 1;
if(k->sa.sa_handler == SIG_IGN){
kfree(pending);
ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate);
mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);
return;
}
else if(k->sa.sa_handler){
@@ -808,7 +774,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
if(copy_to_user(sigsp, &ksigsp, sizeof ksigsp)){
kfree(pending);
ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate);
mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);
kprintf("do_signal,write_process_vm failed\n");
terminate(0, sig);
return;
@@ -827,7 +793,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
if(!(k->sa.sa_flags & SA_NODEFER))
thread->sigmask.__val[0] |= pending->sigmask.__val[0];
kfree(pending);
ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate);
mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);
if(regs->gpr.rflags & RFLAGS_TF){
struct siginfo info;
@@ -853,7 +819,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
}
else
kfree(pending);
ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate);
mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);
switch (sig) {
case SIGSTOP:
case SIGTSTP:
@@ -885,7 +851,8 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
/* Wake up the parent who tried wait4 and sleeping */
waitq_wakeup(&proc->parent->waitpid_q);
dkprintf("do_signal,SIGSTOP,sleeping\n");
dkprintf("do_signal(): pid: %d, tid: %d SIGSTOP, sleeping\n",
proc->pid, thread->tid);
/* Sleep */
schedule();
dkprintf("SIGSTOP(): woken up\n");
@@ -899,7 +866,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
/* Update thread state in fork tree */
mcs_rwlock_writer_lock(&proc->update_lock, &lock);
proc->exit_status = SIGTRAP;
thread->exit_status = SIGTRAP;
proc->status = PS_TRACED;
thread->status = PS_TRACED;
mcs_rwlock_writer_unlock(&proc->update_lock, &lock);
@@ -953,11 +920,11 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
static struct sig_pending *
getsigpending(struct thread *thread, int delflag){
struct list_head *head;
ihk_spinlock_t *lock;
mcs_rwlock_lock_t *lock;
struct mcs_rwlock_node_irqsave mcs_rw_node;
struct sig_pending *next;
struct sig_pending *pending;
__sigset_t w;
int irqstate;
__sigset_t x;
int sig;
struct k_sigaction *k;
@@ -966,8 +933,12 @@ getsigpending(struct thread *thread, int delflag){
lock = &thread->sigcommon->lock;
head = &thread->sigcommon->sigpending;
for(;;){
irqstate = ihk_mc_spinlock_lock(lock);
for(;;) {
if (delflag)
mcs_rwlock_writer_lock(lock, &mcs_rw_node);
else
mcs_rwlock_reader_lock(lock, &mcs_rw_node);
list_for_each_entry_safe(pending, next, head, list){
for(x = pending->sigmask.__val[0], sig = 0; x; sig++, x >>= 1);
k = thread->sigcommon->action + sig - 1;
@@ -976,17 +947,26 @@ getsigpending(struct thread *thread, int delflag){
(k->sa.sa_handler != (void *)1 &&
k->sa.sa_handler != NULL)){
if(!(pending->sigmask.__val[0] & w)){
if(delflag)
if(delflag)
list_del(&pending->list);
ihk_mc_spinlock_unlock(lock, irqstate);
if (delflag)
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
else
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
return pending;
}
}
}
ihk_mc_spinlock_unlock(lock, irqstate);
if (delflag)
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
else
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
if(lock == &thread->sigpendinglock)
return NULL;
lock = &thread->sigpendinglock;
head = &thread->sigpending;
}
@@ -1034,22 +1014,25 @@ check_signal(unsigned long rc, void *regs0, int num)
}
}
ihk_mc_spinlock_unlock(&(cpu_local_var(runq_lock)), irqstate);
return;
goto out;
}
if(regs != NULL && !interrupt_from_user(regs)) {
return;
goto out;
}
for(;;){
pending = getsigpending(thread, 1);
if(!pending) {
dkprintf("check_signal,queue is empty\n");
return;
goto out;
}
do_signal(rc, regs, thread, pending, num);
}
out:
return;
}
unsigned long
@@ -1063,7 +1046,8 @@ do_kill(struct thread *thread, int pid, int tid, int sig, siginfo_t *info,
struct thread *tthread = NULL;
int i;
__sigset_t mask;
ihk_spinlock_t *savelock = NULL;
mcs_rwlock_lock_t *savelock = NULL;
struct mcs_rwlock_node mcs_rw_node;
struct list_head *head = NULL;
int rc;
unsigned long irqstate = 0;
@@ -1195,7 +1179,8 @@ done:
if(pid != -1 && tthread->proc->pid != pid){
continue;
}
if(tthread->tid == tid){
if (tthread->tid == tid &&
tthread->status != PS_EXITED) {
found = 1;
break;
}
@@ -1245,9 +1230,15 @@ done:
return 0;
}
if (tthread->thread_offloaded) {
interrupt_syscall(tthread, sig);
release_thread(tthread);
return 0;
}
doint = 0;
ihk_mc_spinlock_lock_noirq(savelock);
mcs_rwlock_writer_lock_noirq(savelock, &mcs_rw_node);
/* Put signal event even when handler is SIG_IGN or SIG_DFL
because target ptraced thread must call ptrace_report_signal
@@ -1286,12 +1277,10 @@ done:
}
}
}
ihk_mc_spinlock_unlock_noirq(savelock);
mcs_rwlock_writer_unlock_noirq(savelock, &mcs_rw_node);
cpu_restore_interrupt(irqstate);
if (doint && !(mask & tthread->sigmask.__val[0])) {
int tid = tthread->tid;
int pid = tproc->pid;
int status = tthread->status;
if (thread != tthread) {
@@ -1301,7 +1290,7 @@ done:
}
if(!tthread->proc->nohost)
interrupt_syscall(pid, tid);
interrupt_syscall(tthread, 0);
if (status != PS_RUNNING) {
if(sig == SIGKILL){
@@ -1313,6 +1302,9 @@ done:
sched_wakeup_thread(tthread, PS_STOPPED);
tthread->proc->status = PS_RUNNING;
}
else {
sched_wakeup_thread(tthread, PS_INTERRUPTIBLE);
}
}
}
release_thread(tthread);
@@ -1559,7 +1551,7 @@ static int vdso_get_vdso_info(void)
{
int error;
struct ikc_scd_packet packet;
struct ihk_ikc_channel_desc *ch = cpu_local_var(syscall_channel);
struct ihk_ikc_channel_desc *ch = cpu_local_var(ikc2linux);
dkprintf("vdso_get_vdso_info()\n");
memset(&vdso, '\0', sizeof vdso);
@@ -1757,7 +1749,8 @@ int arch_map_vdso(struct process_vm *vm)
vrflags = VR_REMOTE;
vrflags |= VR_PROT_READ | VR_PROT_EXEC;
vrflags |= VRFLAG_PROT_TO_MAXPROT(vrflags);
error = add_process_memory_range(vm, (intptr_t)s, (intptr_t)e, NOPHYS, vrflags, NULL, 0, PAGE_SHIFT);
error = add_process_memory_range(vm, (intptr_t)s, (intptr_t)e,
NOPHYS, vrflags, NULL, 0, PAGE_SHIFT, NULL);
if (error) {
ekprintf("ERROR: adding memory range for vdso. %d\n", error);
goto out;
@@ -1788,7 +1781,8 @@ int arch_map_vdso(struct process_vm *vm)
vrflags = VR_REMOTE;
vrflags |= VR_PROT_READ;
vrflags |= VRFLAG_PROT_TO_MAXPROT(vrflags);
error = add_process_memory_range(vm, (intptr_t)s, (intptr_t)e, NOPHYS, vrflags, NULL, 0, PAGE_SHIFT);
error = add_process_memory_range(vm, (intptr_t)s, (intptr_t)e,
NOPHYS, vrflags, NULL, 0, PAGE_SHIFT, NULL);
if (error) {
ekprintf("ERROR: adding memory range for vvar. %d\n", error);
goto out;
@@ -1836,4 +1830,61 @@ out:
return error;
} /* arch_map_vdso() */
/*
 * save_uctx() - copy the general purpose registers of a saved user context
 * into the buffer at uctx.
 *
 * @uctx: destination buffer, written using the struct trans_uctx layout below
 * @regs: register frame to copy from; when NULL, the frame is taken to be the
 *        struct x86_user_context immediately below the tss.rsp0 value read
 *        through %gs
 *
 * Besides the registers, cond and fregsize are set to 0 and fs is filled in
 * through ihk_mc_arch_get_special_register(IHK_ASR_X86_FS, ...).
 */
void
save_uctx(void *uctx, struct x86_user_context *regs)
{
	/* Field order defines the layout of the destination buffer. */
	struct trans_uctx {
		volatile int cond;
		int fregsize;

		unsigned long rax;
		unsigned long rbx;
		unsigned long rcx;
		unsigned long rdx;
		unsigned long rsi;
		unsigned long rdi;
		unsigned long rbp;
		unsigned long r8;
		unsigned long r9;
		unsigned long r10;
		unsigned long r11;
		unsigned long r12;
		unsigned long r13;
		unsigned long r14;
		unsigned long r15;
		unsigned long rflags;
		unsigned long rip;
		unsigned long rsp;
		unsigned long fs;
	} *dst = uctx;

	if (regs == NULL) {
		/* Load tss.rsp0 from the per-CPU area addressed through %gs. */
		asm ("movq %%gs:(%1),%0" : "=r"(regs) :
			"r"(offsetof(struct x86_cpu_local_variables, tss.rsp0)));
		/* Step back one struct to reach the frame below that address. */
		regs--;
	}

	dst->cond = 0;

	/* Legacy registers */
	dst->rax = regs->gpr.rax;
	dst->rbx = regs->gpr.rbx;
	dst->rcx = regs->gpr.rcx;
	dst->rdx = regs->gpr.rdx;
	dst->rsi = regs->gpr.rsi;
	dst->rdi = regs->gpr.rdi;
	dst->rbp = regs->gpr.rbp;

	/* Extended registers */
	dst->r8 = regs->gpr.r8;
	dst->r9 = regs->gpr.r9;
	dst->r10 = regs->gpr.r10;
	dst->r11 = regs->gpr.r11;
	dst->r12 = regs->gpr.r12;
	dst->r13 = regs->gpr.r13;
	dst->r14 = regs->gpr.r14;
	dst->r15 = regs->gpr.r15;

	/* Flags, instruction pointer and stack pointer */
	dst->rflags = regs->gpr.rflags;
	dst->rip = regs->gpr.rip;
	dst->rsp = regs->gpr.rsp;

	ihk_mc_arch_get_special_register(IHK_ASR_X86_FS, &dst->fs);
	dst->fregsize = 0;
}
/*** End of File ***/

View File

@@ -17,6 +17,7 @@
* make sure that these are position-independent codes.
*/
#include <cls.h>
#include <syscall.h>
#include <ihk/atomic.h>
#include <arch/cpu.h>

View File

@@ -0,0 +1,67 @@
#!/usr/bin/expect
# Batch driver for eclair: lists the threads of a McKernel dump, then selects
# thread 1, 2, 3, ... in turn and requests a backtrace for each of them.
#
# "@prefix@" is a template placeholder; presumably it is replaced with the
# installation prefix when the script is generated (TODO confirm).
set INST_DIR "@prefix@"
# Start eclair on /tmp/mckernel.dump together with the installed kernel image.
spawn $INST_DIR/bin/eclair -d /tmp/mckernel.dump -k $INST_DIR/smp-x86/kernel/mckernel.img -i
# Dialogue state; one of: init, threads_list, thread_chosen, thread_bt,
# thread_skip.
set state "init"
# Number used in the most recent "thread N" command. It is incremented before
# each selection, so the first thread selected is 1.
set thread_id 0
# Main loop. Every handler below acts according to $state and then re-arms the
# expect with exp_continue:
#   "in ?? ()"     while a thread is selected or being backtraced, switch to
#                  thread_skip.
#   "(eclair) "    prompt: init sends "info threads"; threads_list and
#                  thread_skip select the next thread; thread_chosen sends
#                  "bt".
#   pager prompt   keep paging for threads_list and thread_bt; send "q" for
#                  thread_skip.
#   " not known."  quit eclair, confirm, and exit 0. Presumably printed when
#                  the selected thread number does not exist (TODO confirm).
# NOTE(review): the prompt handler has no arm for state thread_bt, so nothing
# is sent when a backtrace completes without "in ?? ()" having matched;
# confirm this is intended.
expect {
"in ?? ()" {
switch -- $state {
"thread_chosen" {
set state "thread_skip"
}
"thread_bt" {
set state "thread_skip"
}
}
exp_continue
}
"(eclair) " {
switch -- $state {
"init" {
set state "threads_list"
send "info threads\r"
}
"threads_list" {
incr thread_id
set state "thread_chosen"
send "thread $thread_id\r"
}
"thread_skip" {
incr thread_id
set state "thread_chosen"
send "thread $thread_id\r"
}
"thread_chosen" {
set state "thread_bt"
send "bt\r"
}
}
exp_continue
}
"Type <return> to continue, or q <return> to quit" {
switch -- $state {
"threads_list" {
send "\r"
}
"thread_bt" {
send "\r"
}
"thread_skip" {
send "q\r"
}
}
exp_continue
}
" not known." {
expect "(eclair) " { send "quit\r" }
expect "Quit anyway? (y or n) " { send "y\r" }
exit 0
}
}

View File

@@ -3,7 +3,7 @@ Description=irqbalance daemon
After=syslog.target
[Service]
EnvironmentFile=@ETCDIR@/irqbalance_mck
EnvironmentFile=/tmp/irqbalance_mck
ExecStart=/usr/sbin/irqbalance --foreground $IRQBALANCE_ARGS
[Install]

View File

@@ -3,13 +3,13 @@
# IHK SMP-x86 example boot script.
# author: Balazs Gerofi <bgerofi@riken.jp>
# Copyright (C) 2014 RIKEN AICS
#
# This is an example script for loading IHK, configuring a partition and
# booting McKernel on it.
# The script reserves half of the CPU cores and 512MB of RAM from NUMA node 0
# when IHK is loaded for the first time, otherwise it destroys the current
# McKernel instance and reboots it using the same set of resources as it used
# previously.
#
# This is an example script for loading IHK, configuring a partition and
# booting McKernel on it. Unless specific CPUs and memory are requested,
# the script reserves half of the CPU cores and 512MB of RAM from
# NUMA node 0 when IHK is loaded for the first time.
# Otherwise, it destroys the current McKernel instance and reboots it using
# the same set of resources as it used previously.
# Note that the script does not output anything unless an error occurs.
prefix="@prefix@"
@@ -22,6 +22,12 @@ ENABLE_MCOVERLAYFS="@ENABLE_MCOVERLAYFS@"
mem="512M@0"
cpus=""
ikc_map=""
if [ "${BASH_VERSINFO[0]}" -lt 4 ]; then
echo "You need at least bash-4.0 to run this script." >&2
exit 1
fi
INTERVAL=1
LOGMODE=0
@@ -29,12 +35,15 @@ facility="LOG_LOCAL6"
chown_option=`logname 2> /dev/null`
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" -o "`systemctl status irqbalance.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
irqbalance_used="yes"
irqbalance_used="yes"
else
irqbalance_used="no"
irqbalance_used="no"
fi
while getopts :i:k:c:m:o:f: OPT
turbo=""
ihk_irq=""
while getopts :ti:k:c:m:o:f:r:q: OPT
do
case ${OPT} in
f) facility=${OPTARG}
@@ -71,11 +80,114 @@ do
;;
m) mem=${OPTARG}
;;
r) ikc_map=${OPTARG}
;;
q) ihk_irq=${OPTARG}
;;
t) turbo="turbo"
;;
*) echo "invalid option -${OPT}" >&2
exit 1
esac
done
#
# Revert any state that has been initialized before the error occurred.
#
error_exit() {
# $1 names the stage at which to start reverting. Each arm below ends with
# ";&", so execution starts at the arm matching $1 and falls through every
# arm after it, down to "initial". A value that matches no arm reverts
# nothing. The function always terminates the script with "exit 1".
local status=$1
case $status in
mcos_sys_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/mcos0_sys
fi
;&
mcos_proc_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/mcos0_proc
fi
;&
mcoverlayfs_loaded)
if [ "$enable_mcoverlay" == "yes" ]; then
rmmod mcoverlay 2>/dev/null
fi
;&
linux_proc_bind_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/linux_proc
fi
;&
tmp_mcos_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos
fi
;&
tmp_mcos_created)
if [ "$enable_mcoverlay" == "yes" ]; then
rm -rf /tmp/mcos
fi
;&
os_created)
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
# Characters from position 10 onward, i.e. what follows "/dev/mcos"
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
echo "warning: failed to destroy LWK instance $ind" >&2
fi
done
fi
;&
mcctrl_loaded)
rmmod mcctrl 2>/dev/null || echo "warning: failed to remove mcctrl" >&2
;&
cpus_reserved)
# Note: overwrites the script-level $cpus with the result of the query
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if [ "${cpus}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then
echo "warning: failed to release CPUs" >&2
fi
fi
;&
mem_reserved)
# Note: overwrites the script-level $mem with the result of the query
mem=`${SBINDIR}/ihkconfig 0 query mem`
if [ "${mem}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then
echo "warning: failed to release memory" >&2
fi
fi
;&
ihk_smp_loaded)
rmmod ihk_smp_x86 2>/dev/null || echo "warning: failed to remove ihk_smp_x86" >&2
;&
ihk_loaded)
rmmod ihk 2>/dev/null || echo "warning: failed to remove ihk" >&2
;&
irqbalance_stopped)
# Only acts when irqbalance_mck.service is currently reported as active
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
if ! systemctl stop irqbalance_mck.service 2>/dev/null; then
echo "warning: failed to stop irqbalance_mck" >&2
fi
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "warning: failed to disable irqbalance_mck" >&2
fi
# Copy every regular file matching $etcdir/proc/irq/*/smp_affinity to the
# same path with the $etcdir prefix stripped, i.e. to /proc/irq/*/smp_affinity
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }'; then
echo "warning: failed to restore /proc/irq/*/smp_affinity" >&2
fi
if ! systemctl start irqbalance.service; then
echo "warning: failed to start irqbalance" >&2;
fi
fi
;&
initial)
# Nothing more to revert
;;
esac
exit 1
}
ihk_ikc_irq_core=0
release=`uname -r`
@@ -84,13 +196,20 @@ minor=`echo ${release} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/'`
patch=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/'`
linux_version_code=`expr \( ${major} \* 65536 \) + \( ${minor} \* 256 \) + ${patch}`
rhel_release=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/'`
if [ "${release}" == "${rhel_release}" ]; then rhel_release=""; fi
if [ "${release}" == "${rhel_release}" ]; then
rhel_release="";
fi
enable_mcoverlay="no"
if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
if [ "${rhel_release}" == "" ]; then
if [ ${linux_version_code} -ge 262144 -a ${linux_version_code} -lt 262400 ]; then
enable_mcoverlay="yes"
fi
if [ ${linux_version_code} -ge 263680 -a ${linux_version_code} -lt 263936 ]; then
enable_mcoverlay="yes"
fi
else
if [ ${linux_version_code} -eq 199168 -a ${rhel_release} -ge 327 ]; then
enable_mcoverlay="yes"
@@ -98,6 +217,7 @@ if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
fi
fi
# Figure out CPUs if not requested by user
if [ "$cpus" == "" ]; then
# Get the number of CPUs on NUMA node 0
nr_cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $4}' | wc -l`
@@ -105,146 +225,318 @@ if [ "$cpus" == "" ]; then
# Use the second half of the cores
let nr_cpus="$nr_cpus / 2"
cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $1}' | tail -n $nr_cpus | xargs echo -n | sed 's/ /,/g'`
if [ "$cpus" == "" ]; then echo "error: no available CPUs on NUMA node 0?" >&2; exit 1; fi
if [ "$cpus" == "" ]; then
echo "error: no available CPUs on NUMA node 0?" >&2
exit 1
fi
fi
# Remove mcoverlay if loaded
if [ "$enable_mcoverlay" == "yes" ]; then
if [ "`lsmod | grep mcoverlay`" != "" ]; then
if grep mcoverlay /proc/modules &>/dev/null; then
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
if ! rmmod mcoverlay; then echo "error: removing mcoverlay" >&2; exit 1; fi
if ! rmmod mcoverlay 2>/dev/null; then
echo "error: removing mcoverlay" >&2
exit 1
fi
fi
fi
# Stop irqbalance
if [ "${irqbalance_used}" == "yes" ]; then
systemctl stop irqbalance_mck.service 2>/dev/null
if ! systemctl stop irqbalance.service 2>/dev/null ; then echo "error: stopping irqbalance" >&2; exit 1; fi;
if ! systemctl stop irqbalance.service 2>/dev/null ; then
echo "error: stopping irqbalance" >&2
exit 1
fi;
if ! etcdir=@ETCDIR@ perl -e 'use File::Copy qw(copy); $etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "/proc/irq/*/smp_affinity"; foreach $file (@files) { $rel = substr($file, 1); $dir=substr($rel, 0, length($rel)-length("/smp_affinity")); if(0) { print "cp $file $etcdir/$rel\n";} if(system("mkdir -p $etcdir/$dir")){ exit 1;} if(!copy($file,"$etcdir/$rel")){ exit 1;} }'; then
echo "error: saving /proc/irq/*/smp_affinity" >&2
error_exit "mcos_sys_mounted"
fi;
# Prevent /proc/irq/*/smp_affinity from getting zero after offlining
# McKernel CPUs by using the following algorithm.
# if (smp_affinity & mck_cores) {
# smp_affinity = (mck_cores ^ -1);
# }
ncpus=`lscpu | grep -E '^CPU\(s\):' | awk '{print $2}'`
smp_affinity_mask=`echo $cpus | ncpus=$ncpus perl -e 'while(<>){@tokens = split /,/;foreach $token (@tokens) {@nums = split /-/,$token; for($num = $nums[0]; $num <= $nums[$#nums]; $num++) {$ndx=int($num/32); $mask[$ndx] |= (1<<($num % 32))}}} $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if($j != $nint32s - 1){print ",";} $nblks = ($j != $nint32s - 1) ? 8 : ($ENV{'ncpus'} % 32 != 0) ? int((($ENV{'ncpus'} + 3) % 32) / 4) : 8; for($i = $nblks - 1;$i >= 0;$i--){ printf("%01x",($mask[$j] >> ($i*4)) & 0xf);}}'`
# echo cpus=$cpus ncpus=$ncpus smp_affinity_mask=$smp_affinity_mask
if ! ncpus=$ncpus smp_affinity_mask=$smp_affinity_mask perl -e '@dirs = grep { -d } glob "/proc/irq/*"; foreach $dir (@dirs) { $hit = 0; $affinity_str = `cat $dir/smp_affinity`; chomp $affinity_str; @int32strs = split /,/, $affinity_str; @int32strs_mask=split /,/, $ENV{'smp_affinity_mask'}; for($i=0;$i <= $#int32strs_mask; $i++) { $int32strs_inv[$i] = sprintf("%08x",hex($int32strs_mask[$i])^0xffffffff); if($i == 0) { $len = int((($ENV{'ncpus'}%32)+3)/4); if($len != 0) { $int32strs_inv[$i] = substr($int32strs_inv[$i], -$len, $len); } } } $inv = join(",", @int32strs_inv); $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if(hex($int32strs[$nint32s - 1 - $j]) & hex($int32strs_mask[$nint32s - 1 - $j])) { $hit = 1; }} if($hit == 1) { $cmd = "echo $inv > $dir/smp_affinity 2>/dev/null"; system $cmd;}}'; then
echo "error: modifying /proc/irq/*/smp_affinity" >&2
error_exit "mcos_sys_mounted"
fi
fi
# Load IHK if not loaded
if [ "`lsmod | grep ihk`" == "" ]; then
if ! insmod ${KMODDIR}/ihk.ko; then echo "error: loading ihk" >&2; exit 1; fi;
if ! grep -E 'ihk\s' /proc/modules &>/dev/null; then
if ! taskset -c 0 insmod ${KMODDIR}/ihk.ko 2>/dev/null; then
echo "error: loading ihk" >&2
error_exit "irqbalance_stopped"
fi
fi
# Load IHK-SMP if not loaded and reserve CPUs and memory
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then
ihk_irq=""
for i in `seq 64 255`; do
if [ ! -d /proc/irq/$i ] && [ "`cat /proc/interrupts | grep ":" | awk '{print $1}' | grep -o '[0-9]*' | grep -e '^$i$'`" == "" ]; then
ihk_irq=$i
break
fi
done
if [ "$ihk_irq" == "" ]; then echo "error: no IRQ available" >&2; exit 1; fi
if ! insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core; then echo "error: loading ihk-smp-x86" >&2; exit 1; fi;
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs" >&2; exit 1; fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory" >&2; exit 1; fi
# If loaded, but no resources allocated, get CPUs and memory
else
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus_allocated=`${SBINDIR}/ihkosctl 0 query cpu`
if [ "$cpus_allocated" == "" ]; then
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs" >&2; exit 1; fi
fi
# Increase swappiness so that we have better chance to allocate memory for IHK
echo 100 > /proc/sys/vm/swappiness
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem_allocated=`${SBINDIR}/ihkosctl 0 query mem`
if [ "$mem_allocated" == "" ]; then
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory" >&2; exit 1; fi
# Drop Linux caches to free memory
sync && echo 3 > /proc/sys/vm/drop_caches
# Merge free memory areas into large, physically contiguous ones
echo 1 > /proc/sys/vm/compact_memory 2>/dev/null
sync
# Load IHK-SMP if not loaded and reserve CPUs and memory
if ! grep ihk_smp_x86 /proc/modules &>/dev/null; then
# Pick an IRQ vector for IHK-SMP unless $ihk_irq is already set.
# Candidates are 64..255; the first one that is not in use wins.
if [ "$ihk_irq" == "" ]; then
	for i in `seq 64 255`; do
		# A vector counts as free when it has no /proc/irq/<n> directory
		# and no "<n>:" row in /proc/interrupts. The pattern is
		# double-quoted so that $i is expanded: the previous
		# single-quoted '^$i$' was passed to grep literally and could
		# never match, which silently disabled this second check.
		if [ ! -d /proc/irq/$i ] && ! grep -qE "^[[:space:]]*$i:" /proc/interrupts; then
			ihk_irq=$i
			break
		fi
	done
	# No free vector in the whole range: undo what has been set up so far
	if [ "$ihk_irq" == "" ]; then
		echo "error: no IRQ available" >&2
		error_exit "ihk_loaded"
	fi
fi
if ! taskset -c 0 insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core 2>/dev/null; then
echo "error: loading ihk-smp-x86" >&2
error_exit "ihk_loaded"
fi
# Offline-reonline RAM (special case for OFP SNC-4 mode)
# Only runs on hosts named cNNNN.ofp* whose online NUMA node list is "0-7".
if [ "`hostname | grep "c[0-9][0-9][0-9][0-9].ofp"`" != "" ] && [ "`cat /sys/devices/system/node/online`" == "0-7" ]; then
	# For each node in order: take every memory block offline, then bring
	# every block back online.
	#
	# The value must reach the sysfs file on stdout, so only stderr is
	# discarded. The previous form `echo 0 > $f 2>&1 > /dev/null` redirected
	# stdout to /dev/null last, so nothing was ever written to $f.
	for i in 0 1 2 3 4 5 6 7; do
		find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
			echo 0 > "$f" 2>/dev/null
		done
		find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
			echo 1 > "$f" 2>/dev/null
		done
	done
fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then
echo "error: reserving memory" >&2
error_exit "ihk_smp_loaded"
fi
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then
echo "error: reserving CPUs" >&2;
error_exit "mem_reserved"
fi
fi
# Load mcctrl if not loaded
if [ "`lsmod | grep mcctrl`" == "" ]; then
if ! insmod ${KMODDIR}/mcctrl.ko; then echo "error: inserting mcctrl.ko" >&2; exit 1; fi
if ! grep mcctrl /proc/modules &>/dev/null; then
if ! taskset -c 0 insmod ${KMODDIR}/mcctrl.ko 2>/dev/null; then
echo "error: inserting mcctrl.ko" >&2
error_exit "cpus_reserved"
fi
fi
# Check for existing OS instance and destroy
if [ -c /dev/mcos0 ]; then
# Query CPU cores and memory of OS instance so that the same values are used as previously
if ! ${SBINDIR}/ihkosctl 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkosctl 0 query cpu`
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem=`${SBINDIR}/ihkosctl 0 query mem`
if ! ${SBINDIR}/ihkconfig 0 destroy 0; then echo "warning: destroy failed" >&2; fi
else
# Otherwise query IHK-SMP for resources
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
echo "error: destroying LWK instance $ind failed" >&2
error_exit "mcctrl_loaded"
fi
done
fi
if ! ${SBINDIR}/ihkconfig 0 create; then echo "error: create" >&2; exit; fi
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then echo "error: assign CPUs" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then echo "error: assign memory" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then echo "error: loading kernel image" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE}"; then echo "error: setting kernel arguments" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 boot; then echo "error: booting" >&2; exit 1; fi
if ! chown ${chown_option} /dev/mcd* /dev/mcos*; then echo "error: chowning device files" >&2; exit 1; fi
# Create OS instance
if ! ${SBINDIR}/ihkconfig 0 create; then
echo "error: creating OS instance" >&2
error_exit "mcctrl_loaded"
fi
# Assign CPUs
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then
echo "error: assign CPUs" >&2
error_exit "os_created"
fi
# Apply the IKC map only when one was supplied in $ikc_map
if [ "$ikc_map" != "" ]; then
	# Specify IKC map
	if ! ${SBINDIR}/ihkosctl 0 ikc_map ${ikc_map}; then
		# Distinct message: this step used to report "assign CPUs",
		# which made it indistinguishable from the CPU assignment above.
		echo "error: assign IKC map" >&2
		error_exit "os_created"
	fi
fi
# Assign memory
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then
echo "error: assign memory" >&2
error_exit "os_created"
fi
# Load kernel image
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then
echo "error: loading kernel image: ${KERNDIR}/mckernel.img" >&2
error_exit "os_created"
fi
# Set kernel arguments
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE} $turbo"; then
echo "error: setting kernel arguments" >&2
error_exit "os_created"
fi
# Boot OS instance
if ! ${SBINDIR}/ihkosctl 0 boot; then
echo "error: booting" >&2
error_exit "os_created"
fi
# Set device file ownership
if ! chown ${chown_option} /dev/mcd* /dev/mcos*; then
echo "warning: failed to chown device files" >&2
fi
# Overlay /proc, /sys with McKernel specific contents
if [ "$enable_mcoverlay" == "yes" ]; then
if [ ! -e /tmp/mcos ]; then mkdir -p /tmp/mcos; fi
if ! mount -t tmpfs tmpfs /tmp/mcos; then echo "error: mount /tmp/mcos" >&2; exit 1; fi
if [ ! -e /tmp/mcos/linux_proc ]; then mkdir -p /tmp/mcos/linux_proc; fi
if ! mount --bind /proc /tmp/mcos/linux_proc; then echo "error: mount /tmp/mcos/linux_proc" >&2; exit 1; fi
if ! insmod ${KMODDIR}/mcoverlay.ko; then echo "error: inserting mcoverlay.ko" >&2; exit 1; fi
if [ ! -e /tmp/mcos ]; then
mkdir -p /tmp/mcos;
fi
if ! mount -t tmpfs tmpfs /tmp/mcos; then
echo "error: mount /tmp/mcos" >&2
error_exit "tmp_mcos_created"
fi
if [ ! -e /tmp/mcos/linux_proc ]; then
mkdir -p /tmp/mcos/linux_proc;
fi
if ! mount --bind /proc /tmp/mcos/linux_proc; then
echo "error: mount /tmp/mcos/linux_proc" >&2
error_exit "tmp_mcos_mounted"
fi
if ! taskset -c 0 insmod ${KMODDIR}/mcoverlay.ko 2>/dev/null; then
echo "error: inserting mcoverlay.ko" >&2
error_exit "linux_proc_bind_mounted"
fi
while [ ! -e /proc/mcos0 ]
do
sleep 1
sleep 0.1
done
if [ ! -e /tmp/mcos/mcos0_proc ]; then mkdir -p /tmp/mcos/mcos0_proc; fi
if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then mkdir -p /tmp/mcos/mcos0_proc_upper; fi
if [ ! -e /tmp/mcos/mcos0_proc_work ]; then mkdir -p /tmp/mcos/mcos0_proc_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then echo "error: mount /tmp/mcos/mcos0_proc" >&2; exit 1; fi
if [ ! -e /tmp/mcos/mcos0_proc ]; then
mkdir -p /tmp/mcos/mcos0_proc;
fi
if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then
mkdir -p /tmp/mcos/mcos0_proc_upper;
fi
if [ ! -e /tmp/mcos/mcos0_proc_work ]; then
mkdir -p /tmp/mcos/mcos0_proc_work;
fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then
echo "error: mounting /tmp/mcos/mcos0_proc" >&2
error_exit "mcoverlayfs_loaded"
fi
# TODO: How do we revert this in case of failure??
mount --make-rprivate /proc
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys ]
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys/setup_complete ]
do
sleep 1
sleep 0.1
done
if [ ! -e /tmp/mcos/mcos0_sys ]; then mkdir -p /tmp/mcos/mcos0_sys; fi
if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then mkdir -p /tmp/mcos/mcos0_sys_upper; fi
if [ ! -e /tmp/mcos/mcos0_sys_work ]; then mkdir -p /tmp/mcos/mcos0_sys_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then echo "error: mount /tmp/mcos/mcos0_sys" >&2; exit 1; fi
if [ ! -e /tmp/mcos/mcos0_sys ]; then
mkdir -p /tmp/mcos/mcos0_sys;
fi
if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then
mkdir -p /tmp/mcos/mcos0_sys_upper;
fi
if [ ! -e /tmp/mcos/mcos0_sys_work ]; then
mkdir -p /tmp/mcos/mcos0_sys_work;
fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then
echo "error: mount /tmp/mcos/mcos0_sys" >&2
error_exit "mcos_proc_mounted"
fi
# TODO: How do we revert this in case of failure??
mount --make-rprivate /sys
touch /tmp/mcos/mcos0_proc/mckernel
rm -rf /tmp/mcos/mcos0_sys/setup_complete
# Hide NUMA related files which are outside the LWK partition
for cpuid in `find /sys/devices/system/cpu/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/drivers/processor/$cpuid
else
for nodeid in `find /sys/devices/system/cpu/$cpuid/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid/$nodeid" ]; then
rm -f /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid/$nodeid
fi
done
fi
done
for nodeid in `find /sys/devices/system/node/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/*
rm -rf /tmp/mcos/mcos0_sys/bus/node/devices/$nodeid
else
# Delete non-existent symlinks
for cpuid in `find /sys/devices/system/node/$nodeid/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid/$cpuid" ]; then
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/$cpuid
fi
done
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/memory*
fi
done
rm -f /tmp/mcos/mcos0_sys/devices/system/node/has_*
for cpuid in `find /sys/bus/cpu/devices/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/bus/cpu/devices/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
fi
done
fi
if [ ${LOGMODE} -ne 0 ]
then
# mcklogd survives when McKernel isn't shut down by mcstop+release.sh
pkill mcklogd
SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL} -f ${facility}
fi
# Start irqbalance with CPUs and IRQ for McKernel banned
if [ "${irqbalance_used}" == "yes" ]; then
if ! etcdir=@ETCDIR@ perl -e 'use File::Copy qw(copy); $etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "/proc/irq/*/smp_affinity"; foreach $file (@files) { $rel = substr($file, 1); $dir=substr($rel, 0, length($rel)-length("/smp_affinity")); if(0) { print "cp $file $etcdir/$rel\n";} if(system("mkdir -p $etcdir/$dir")){ exit 1;} if(!copy($file,"$etcdir/$rel")){ exit 1;} }' ; then echo "error: saving /proc/irq/*/smp_affinity" >&2; exit 1; fi;
ncpus=`lscpu | grep -E '^CPU\(s\):' | awk '{print $2}'`
smp_affinity_mask=`echo $cpus | ncpus=$ncpus perl -e 'while(<>){@tokens = split /,/;foreach $token (@tokens) {@nums = split /-/,$token; for($num = $nums[0]; $num <= $nums[$#nums]; $num++) {$ndx=int($num/32); $mask[$ndx] |= (1<<($num % 32))}}} $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if($j != $nint32s - 1){print ",";} $nblks = $j == $nint32s - 1 ? int(($ENV{'ncpus'} % 32)/4) : 8; for($i = $nblks - 1;$i >= 0;$i--){ printf("%01x",($mask[$j] >> ($i*4)) & 0xf);}}'`
if ! ncpus=$ncpus smp_affinity_mask=$smp_affinity_mask perl -e '@dirs = grep { -d } glob "/proc/irq/*"; foreach $dir (@dirs) { $hit = 0; $affinity_str = `cat $dir/smp_affinity`; chomp $affinity_str; @int32strs = split /,/, $affinity_str; @int32strs_mask=split /,/, $ENV{'smp_affinity_mask'}; for($i=0;$i <= $#int32strs_mask; $i++) { $int32strs_inv[$i] = sprintf("%08x",hex($int32strs_mask[$i])^0xffffffff); if($i == 0) { $len = int((($ENV{'ncpus'}%32)+3)/4); $int32strs_inv[$i] = substr($int32strs_inv[$i], -$len, $len); } } $inv = join(",", @int32strs_inv); $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if(hex($int32strs[$nint32s - 1 - $j]) & hex($int32strs_mask[$nint32s - 1 - $j])) { $hit = 1; }} if($hit == 1) { $cmd = "echo $inv > $dir/smp_affinity 2>/dev/null"; system $cmd;}}'; then echo "error: modifying /proc/irq/*/smp_affinity" >&2; exit 1; fi;
banirq=`cat /proc/interrupts| perl -e 'while(<>) { if(/^\s*(\d+).*IHK\-SMP\s*$/) {print $1;}}'`
sed "s/%mask%/$smp_affinity_mask/g" $ETCDIR/irqbalance_mck.in | sed "s/%banirq%/$banirq/g" > $ETCDIR/irqbalance_mck
if ! systemctl link $ETCDIR/irqbalance_mck.service >/dev/null 2>/dev/null; then echo "error: linking irqbalance_mck" >&2; exit 1; fi;
if ! systemctl start irqbalance_mck.service 2>/dev/null ; then echo "error: starting irqbalance_mck" >&2; exit 1; fi;
# echo cpus=$cpus mask=$smp_affinity_mask banirq=$banirq
sed "s/%mask%/$smp_affinity_mask/g" $ETCDIR/irqbalance_mck.in | sed "s/%banirq%/$banirq/g" > /tmp/irqbalance_mck
systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null
if ! systemctl link $ETCDIR/irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "error: linking irqbalance_mck" >&2
error_exit "mcos_sys_mounted"
fi
if ! systemctl start irqbalance_mck.service 2>/dev/null ; then
echo "error: starting irqbalance_mck" >&2
error_exit "mcos_sys_mounted"
fi
# echo cpus=$cpus ncpus=$ncpus banirq=$banirq
fi
# Start mcklogd. Note that McKernel blocks when kmsg buffer is full
# with '-k 1' until mcklogd unblocks it so starting mcklogd must precede
# booting McKernel
if [ ${LOGMODE} -ne 0 ]; then
# Stop mcklogd which has survived McKernel shutdown because
# mcstop+release.sh is not used
pkill mcklogd
SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL} -f ${facility}
fi

View File

@@ -16,49 +16,110 @@ KERNDIR="@KERNDIR@"
mem=""
cpus=""
irqbalance_used=""
# No SMP module? Exit.
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then exit 0; fi
if ! grep ihk_smp_x86 /proc/modules &>/dev/null; then exit 0; fi
# Stop mcklogd
while pgrep "mcklogd" > /dev/null 2>&1;
do
pkill -9 mcklogd
done
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
irqbalance_used="yes"
if ! systemctl stop irqbalance_mck.service 2>/dev/null; then
echo "warning: failed to stop irqbalance_mck" >&2
fi
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "warning: failed to disable irqbalance_mck" >&2
fi
fi
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then echo "error: destroying LWK instance $ind failed" >&2; exit 1; fi
done
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
echo "error: destroying LWK instance $ind failed" >&2
exit 1
fi
done
fi
# Query IHK-SMP resources and release them
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then echo "error: releasing CPUs" >&2; exit 1; fi
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then
echo "error: querying cpus" >&2
exit 1
fi
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then echo "error: releasing memory" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if [ "${cpus}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then
echo "error: releasing CPUs" >&2
exit 1
fi
fi
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then
echo "error: querying memory" >&2
exit 1
fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
if [ "${mem}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then
echo "error: releasing memory" >&2
exit 1
fi
fi
# Remove delegator if loaded
if [ "`lsmod | grep mcctrl`" != "" ]; then
if ! rmmod mcctrl; then echo "error: removing mcctrl" >&2; exit 1; fi
if grep mcctrl /proc/modules &>/dev/null; then
if ! rmmod mcctrl 2>/dev/null; then
echo "error: removing mcctrl" >&2
exit 1
fi
fi
# Remove mcoverlay if loaded
if grep mcoverlay /proc/modules &>/dev/null; then
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
if ! rmmod mcoverlay 2>/dev/null; then
echo "warning: failed to remove mcoverlay" >&2
fi
fi
# Remove SMP module
if [ "`lsmod | grep ihk_smp_x86`" != "" ]; then
if ! rmmod ihk_smp_x86; then echo "error: removing ihk_smp_x86" >&2; exit 1; fi
if grep ihk_smp_x86 /proc/modules &>/dev/null; then
if ! rmmod ihk_smp_x86 2>/dev/null; then
echo "error: removing ihk_smp_x86" >&2
exit 1
fi
fi
# Remove core module
if [ "`lsmod | grep -E 'ihk\s' | awk '{print $1}'`" != "" ]; then
if ! rmmod ihk; then echo "error: removing ihk" >&2; exit 1; fi
if grep -E 'ihk\s' /proc/modules &>/dev/null; then
if ! rmmod ihk 2>/dev/null; then
echo "error: removing ihk" >&2
exit 1
fi
fi
# Stop mcklogd
pkill mcklogd
# Start irqbalance with the original settings
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
if ! systemctl stop irqbalance_mck.service 2>/dev/null ; then echo "error: stopping irqbalance_mck" >&2; exit 1; fi;
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then echo "error: disabling irqbalance_mck" >&2; exit 1; fi;
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }' ; then echo "error: restoring /proc/irq/*/smp_affinity" >&2; exit 1; fi;
if ! systemctl start irqbalance.service; then echo "error: starting irqbalance" >&2; exit 1; fi;
if [ "${irqbalance_used}" != "" ]; then
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }'; then
echo "warning: failed to restore /proc/irq/*/smp_affinity" >&2
fi
if ! systemctl start irqbalance.service; then
echo "warning: failed to start irqbalance" >&2;
fi
fi
# Set back default swappiness
echo 60 > /proc/sys/vm/swappiness

View File

@@ -1,4 +1,4 @@
/* executer/config.h.in. Generated from configure.ac by autoheader. */
/* config.h.in. Generated from configure.ac by autoheader. */
/* whether mcoverlayfs is enabled */
#undef ENABLE_MCOVERLAYFS
@@ -6,6 +6,9 @@
/* whether memdump feature is enabled */
#undef ENABLE_MEMDUMP
/* whether rusage is enabled */
#undef ENABLE_RUSAGE
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
@@ -51,6 +54,9 @@
/* Define to address of kernel symbol sys_readlink, or 0 if exported */
#undef MCCTRL_KSYM_sys_readlink
/* Define to address of kernel symbol sys_umount, or 0 if exported */
#undef MCCTRL_KSYM_sys_umount
/* Define to address of kernel symbol sys_unshare, or 0 if exported */
#undef MCCTRL_KSYM_sys_unshare
@@ -69,6 +75,9 @@
/* Define to address of kernel symbol zap_page_range, or 0 if exported */
#undef MCCTRL_KSYM_zap_page_range
/* McKernel specific libraries */
#undef MCKERNEL_LIBDIR
/* Define to the address where bug reports for this package should be sent. */
#undef PACKAGE_BUGREPORT
@@ -87,5 +96,8 @@
/* Define to the version of this package. */
#undef PACKAGE_VERSION
/* Path of bind-mount source directory */
#undef ROOTFSDIR
/* Define to 1 if you have the ANSI C header files. */
#undef STDC_HEADERS

1963
configure vendored

File diff suppressed because it is too large Load Diff

View File

@@ -17,6 +17,13 @@ DCFA_RELEASE_DATE=DCFA_RELEASE_DATE_m4
AC_PREFIX_DEFAULT([/opt/ppos])
AC_CHECK_HEADER([numa.h],[numa_header_found=yes])
AS_IF([test "x$numa_header_found" != "xyes"],
[AC_MSG_ERROR([Unable to find numa.h header file, missing numactl-devel?])])
AC_CHECK_LIB([numa],[numa_run_on_node],[numa_lib_found=yes])
AS_IF([test "x$numa_lib_found" != "xyes"],
[AC_MSG_ERROR([Unable to find NUMA library, missing numactl-devel?])])
AC_ARG_WITH([kernelsrc],
AC_HELP_STRING(
[--with-kernelsrc=path],[Path to 'kernel src', default is /lib/modules/uname_r/build]),
@@ -48,6 +55,23 @@ AC_ARG_ENABLE([mcoverlayfs],
[ENABLE_MCOVERLAYFS=$enableval],
[ENABLE_MCOVERLAYFS=yes])
AC_ARG_ENABLE([rusage],
AC_HELP_STRING([--enable-rusage],
[enable rusage implementation]),
[ENABLE_RUSAGE=$enableval],
[ENABLE_RUSAGE=yes])
AC_ARG_WITH([uname_r],
AC_HELP_STRING(
[--with-uname_r=uname_r],[Value of '`uname -r`' on the target platform, default is local value]),
[WITH_UNAME_R=$withval],[WITH_UNAME_R=yes])
case "X$WITH_UNAME_R" in
Xyes | Xno | X)
WITH_UNAME_R='`uname -r`'
;;
esac
case "X$WITH_KERNELSRC" in
Xyes | Xno | X)
WITH_KERNELSRC='/lib/modules/`uname -r`/build'
@@ -64,12 +88,14 @@ if test "X$WITH_TARGET" = Xyes -o "X$WITH_TARGET" = Xno; then
fi
test "x$prefix" = xNONE && prefix="$ac_default_prefix"
AC_DEFINE_UNQUOTED(ROOTFSDIR,"$prefix/rootfs",[Path of bind-mount source directory])
case $WITH_TARGET in
attached-mic|builtin-x86|smp-x86)
ARCH=`uname -m`
AC_PROG_CC
XCC=$CC
CFLAGS="$CFLAGS -ffreestanding -fno-tree-loop-distribute-patterns"
;;
builtin-mic)
ARCH=k1om
@@ -146,6 +172,9 @@ case $WITH_TARGET in
if test "X$SBINDIR" = X; then
SBINDIR="$prefix/sbin"
fi
if test "X$MCKERNEL_LIBDIR" = X; then
MCKERNEL_LIBDIR="$prefix/lib"
fi
if test "X$ETCDIR" = X; then
ETCDIR="$prefix/etc"
fi
@@ -162,6 +191,7 @@ case $WITH_TARGET in
esac
KDIR="$WITH_KERNELSRC"
UNAME_R="$WITH_UNAME_R"
TARGET="$WITH_TARGET"
MCCTRL_LINUX_SYMTAB=""
@@ -221,6 +251,7 @@ AC_DEFUN([MCCTRL_FIND_KSYM],[
])
MCCTRL_FIND_KSYM([sys_mount])
MCCTRL_FIND_KSYM([sys_umount])
MCCTRL_FIND_KSYM([sys_unshare])
MCCTRL_FIND_KSYM([zap_page_range])
MCCTRL_FIND_KSYM([vdso_image_64])
@@ -274,18 +305,44 @@ else
AC_MSG_NOTICE([mcoverlayfs is disabled])
fi
case $ENABLE_RUSAGE in
yes|no)
;;
default)
ENABLE_RUSAGE=yes
;;
*)
AC_MSG_ERROR([unknown rusage argument: $ENABLE_RUSAGE])
;;
esac
if test "x$ENABLE_RUSAGE" = "xyes" ; then
AC_MSG_NOTICE([rusage is enabled])
AC_DEFINE([ENABLE_RUSAGE],[1],[whether rusage is enabled])
else
AC_MSG_NOTICE([rusage is disabled])
fi
if test "x$MCKERNEL_LIBDIR" != "x" ; then
AC_DEFINE_UNQUOTED(MCKERNEL_LIBDIR,"$MCKERNEL_LIBDIR",[McKernel specific libraries])
fi
AC_SUBST(CC)
AC_SUBST(XCC)
AC_SUBST(ARCH)
AC_SUBST(KDIR)
AC_SUBST(UNAME_R)
AC_SUBST(TARGET)
AC_SUBST(BINDIR)
AC_SUBST(SBINDIR)
AC_SUBST(MCKERNEL_LIBDIR)
AC_SUBST(ETCDIR)
AC_SUBST(KMODDIR)
AC_SUBST(KERNDIR)
AC_SUBST(MANDIR)
AC_SUBST(CFLAGS)
AC_SUBST(ENABLE_MCOVERLAYFS)
AC_SUBST(ENABLE_RUSAGE)
AC_SUBST(IHK_VERSION)
AC_SUBST(MCKERNEL_VERSION)
@@ -295,15 +352,17 @@ AC_SUBST(MCKERNEL_RELEASE_DATE)
AC_SUBST(DCFA_RESEASE_DATE)
AC_SUBST(uncomment_if_ENABLE_MEMDUMP)
AC_CONFIG_HEADERS([executer/config.h])
AC_CONFIG_HEADERS([config.h])
AC_CONFIG_FILES([
Makefile
executer/user/Makefile
executer/user/arch/x86_64/Makefile
executer/kernel/mcctrl/Makefile
executer/kernel/mcctrl/arch/x86_64/Makefile
executer/kernel/mcoverlayfs/Makefile
executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile
executer/kernel/mcoverlayfs/linux-4.0.9/Makefile
executer/kernel/mcoverlayfs/linux-4.6.7/Makefile
kernel/Makefile
kernel/Makefile.build
arch/x86/tools/mcreboot-attached-mic.sh
@@ -311,6 +370,7 @@ AC_CONFIG_FILES([
arch/x86/tools/mcreboot-builtin-x86.sh
arch/x86/tools/mcreboot-smp-x86.sh
arch/x86/tools/mcstop+release-smp-x86.sh
arch/x86/tools/eclair-dump-backtrace.exp
arch/x86/tools/mcshutdown-builtin-x86.sh
arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in
arch/x86/tools/irqbalance_mck.service

View File

@@ -41,6 +41,9 @@
#define MCEXEC_UP_NEW_PROCESS 0x30a02909
#define MCEXEC_UP_GET_CRED 0x30a0290a
#define MCEXEC_UP_GET_CREDV 0x30a0290b
#define MCEXEC_UP_GET_NODES 0x30a0290c
#define MCEXEC_UP_GET_CPUSET 0x30a0290d
#define MCEXEC_UP_CREATE_PPD 0x30a0290e
#define MCEXEC_UP_PREPARE_DMA 0x30a02910
#define MCEXEC_UP_FREE_DMA 0x30a02911
@@ -49,7 +52,18 @@
#define MCEXEC_UP_CLOSE_EXEC 0x30a02913
#define MCEXEC_UP_SYS_MOUNT 0x30a02914
#define MCEXEC_UP_SYS_UNSHARE 0x30a02915
#define MCEXEC_UP_SYS_UMOUNT 0x30a02915
#define MCEXEC_UP_SYS_UNSHARE 0x30a02916
#define MCEXEC_UP_UTIL_THREAD1 0x30a02920
#define MCEXEC_UP_UTIL_THREAD2 0x30a02921
#define MCEXEC_UP_SIG_THREAD 0x30a02922
#define MCEXEC_UP_SYSCALL_THREAD 0x30a02924
#define MCEXEC_UP_TERMINATE_THREAD 0x30a02925
#define MCEXEC_UP_GET_NUM_POOL_THREADS 0x30a02926
#define MCEXEC_UP_COPY_FROM_MCK 0x30a03000
#define MCEXEC_UP_COPY_TO_MCK 0x30a03001
#define MCEXEC_UP_DEBUG_LOG 0x40000000
@@ -77,6 +91,26 @@ struct program_image_section {
#define SHELL_PATH_MAX_LEN 1024
#define MCK_RLIM_MAX 20
/*
 * Argument block for a CPU-set request.
 * NOTE(review): presumably the payload of the MCEXEC_UP_GET_CPUSET ioctl
 * defined in this header -- confirm against the mcctrl handler.
 * All pointer members are raw addresses; who allocates and who fills each
 * buffer is not visible from this header.
 */
struct get_cpu_set_arg {
int nr_processes;
/* CPU mask buffer; presumably cpu_set_size bytes long -- TODO confirm */
void *cpu_set;
size_t cpu_set_size; // Size in bytes
int *target_core;
int *mcexec_linux_numa; // NUMA domain to bind mcexec to
/* Second mask buffer; presumably mcexec_cpu_set_size bytes long -- TODO confirm */
void *mcexec_cpu_set;
size_t mcexec_cpu_set_size; // Size in bytes
int *ikc_mapped;
};
/* Number of bits held by the cpu_set[] array embedded in struct program_load_desc. */
#define PLD_CPU_SET_MAX_CPUS 1024
/* Storage word of that array. */
typedef unsigned long __cpu_set_unit;
/* Number of __cpu_set_unit words needed to hold PLD_CPU_SET_MAX_CPUS bits. */
#define PLD_CPU_SET_SIZE (PLD_CPU_SET_MAX_CPUS / (8 * sizeof(__cpu_set_unit)))
/*
 * Distinct single-bit flags.
 * NOTE(review): presumably OR-ed into program_load_desc.mpol_flags to tune the
 * memory policy for heap, stack, BSS and shared memory — confirm with users.
 */
#define MPOL_NO_HEAP 0x01
#define MPOL_NO_STACK 0x02
#define MPOL_NO_BSS 0x04
#define MPOL_SHM_PREMAP 0x08
struct program_load_desc {
int num_sections;
int status;
@@ -105,7 +139,13 @@ struct program_load_desc {
unsigned long envs_len;
struct rlimit rlimit[MCK_RLIM_MAX];
unsigned long interp_align;
unsigned long mpol_flags;
unsigned long mpol_threshold;
unsigned long heap_extension;
int nr_processes;
char shell_path[SHELL_PATH_MAX_LEN];
__cpu_set_unit cpu_set[PLD_CPU_SET_SIZE];
int profile;
struct program_image_section sections[0];
};
@@ -196,8 +236,42 @@ struct sys_mount_desc {
void *data;
};
/*
 * Argument block holding a single directory name.
 * NOTE(review): presumably the payload of MCEXEC_UP_SYS_UMOUNT (inferred from
 * the name) — confirm against the ioctl handler.
 */
struct sys_umount_desc {
/* Path string; which address space it lives in is not visible here */
char *dir_name;
};
/*
 * Argument block holding a single flags word.
 * NOTE(review): presumably the payload of MCEXEC_UP_SYS_UNSHARE (inferred from
 * the name) — confirm against the ioctl handler.
 */
struct sys_unshare_desc {
/* Presumably the flags value passed on to unshare — confirm */
unsigned long unshare_flags;
};
/* Operation selector stored in perf_ctrl_desc.ctrl_type. */
enum perf_ctrl_type {
PERF_CTRL_SET,
PERF_CTRL_GET,
PERF_CTRL_ENABLE,
PERF_CTRL_DISABLE,
};
/*
 * Descriptor of one performance-counter control request.
 * ctrl_type selects which member of the anonymous union is meaningful.
 */
struct perf_ctrl_desc {
enum perf_ctrl_type ctrl_type;
/* NOTE(review): presumably the completion/result code of the request — confirm */
int status;
union {
/* for SET, GET */
struct {
unsigned int target_cntr;
unsigned long config;
unsigned long read_value;
/* Single-bit attributes; same names as the bit-fields of struct ihk_perf_event_attr */
unsigned disabled :1,
pinned :1,
exclude_user :1,
exclude_kernel :1,
exclude_hv :1,
exclude_idle :1;
};
/*
 * for ENABLE, DISABLE — the earlier comment said START/STOP, which are
 * not members of enum perf_ctrl_type; presumably these two are meant.
 */
struct {
unsigned long target_cntr_mask;
};
};
};
#endif

View File

@@ -1,5 +1,5 @@
#include <linux/version.h>
#include "../../config.h"
#include "../../../config.h"
#include "../../mcctrl.h"
#ifdef MCCTRL_KSYM_vdso_image_64
@@ -64,6 +64,10 @@ reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, unsign
unsigned long start = 0L;
unsigned long end;
if (mutex_lock_killable(&usrdata->reserve_lock) < 0) {
return -1;
}
#define DESIRED_USER_END 0x800000000000
#define GAP_FOR_MCEXEC 0x008000000000UL
end = DESIRED_USER_END;
@@ -81,6 +85,8 @@ reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, unsign
up_write(&current->mm->mmap_sem);
#endif
mutex_unlock(&usrdata->reserve_lock);
if (IS_ERR_VALUE(start)) {
return start;
}
@@ -190,3 +196,65 @@ out:
ihk_device_unmap_memory(dev, vdso_pa, sizeof(*vdso));
return;
} /* get_vdso_info() */
/*
 * Load the 64-bit value stored at offset 0xaf80 of the %gs segment and
 * return it as a pointer.
 * NOTE(review): 0xaf80 is a hard-coded offset; presumably the per-CPU slot in
 * which the host kernel keeps the saved user stack pointer. That offset would
 * be specific to one kernel build — confirm before reusing on another kernel.
 */
void *
get_user_sp(void)
{
unsigned long usp;
asm volatile("movq %%gs:0xaf80, %0" : "=r" (usp));
return (void *)usp;
}
/*
 * Store usp at offset 0xaf80 of the %gs segment; counterpart of get_user_sp().
 * NOTE(review): 0xaf80 is a hard-coded offset; presumably the per-CPU slot in
 * which the host kernel keeps the saved user stack pointer — confirm for the
 * kernel build in use.
 */
void
set_user_sp(void *usp)
{
asm volatile("movq %0, %%gs:0xaf80" :: "r" (usp));
}
/*
 * Register context addressed through the opaque ctx pointer taken by
 * save_fs_ctx() and get_fs_ctx(), which access only the fs member.
 * NOTE(review): the field order presumably has to match a definition shared
 * with another component (mcexec / McKernel) — confirm before reordering.
 */
struct trans_uctx {
/* Declared volatile; presumably polled as a hand-over flag — confirm */
volatile int cond;
/* NOTE(review): presumably the size of a floating-point register save area — confirm */
int fregsize;
unsigned long rax;
unsigned long rbx;
unsigned long rcx;
unsigned long rdx;
unsigned long rsi;
unsigned long rdi;
unsigned long rbp;
unsigned long r8;
unsigned long r9;
unsigned long r10;
unsigned long r11;
unsigned long r12;
unsigned long r13;
unsigned long r14;
unsigned long r15;
unsigned long rflags;
unsigned long rip;
unsigned long rsp;
/* Value of MSR_FS_BASE captured by save_fs_ctx() */
unsigned long fs;
};
/* Write fs into the MSR_FS_BASE model-specific register. */
void
restore_fs(unsigned long fs)
{
wrmsrl(MSR_FS_BASE, fs);
}
/*
 * Capture the current MSR_FS_BASE value into the fs member of the
 * struct trans_uctx that ctx points to.
 */
void
save_fs_ctx(void *ctx)
{
	unsigned long fs_base;

	rdmsrl(MSR_FS_BASE, fs_base);
	((struct trans_uctx *)ctx)->fs = fs_base;
}
/*
 * Return the fs member of the struct trans_uctx that ctx points to,
 * i.e. the value previously stored there by save_fs_ctx().
 */
unsigned long
get_fs_ctx(void *ctx)
{
	return ((struct trans_uctx *)ctx)->fs;
}

View File

@@ -75,7 +75,7 @@ static int load_elf(struct linux_binprm *bprm
char buf[32];
int l;
int pass;
char pbuf[1024];
char *pbuf;
const char *path;
if(bprm->envc == 0)
@@ -88,6 +88,11 @@ static int load_elf(struct linux_binprm *bprm
if(elf_ex->e_ident[EI_CLASS] != ELFCLASS64)
return -ENOEXEC;
pbuf = kmalloc(1024, GFP_ATOMIC);
if (!pbuf) {
printk("%s: error: allocating pbuf\n", __FUNCTION__);
return -ENOMEM;
}
path = d_path(&bprm->file->f_path, pbuf, 1024);
if(!path || IS_ERR(path))
path = bprm->interp;
@@ -96,8 +101,10 @@ static int load_elf(struct linux_binprm *bprm
if(!cp ||
!strcmp(cp, "/mcexec") ||
!strcmp(cp, "/ihkosctl") ||
!strcmp(cp, "/ihkconfig"))
!strcmp(cp, "/ihkconfig")) {
kfree(pbuf);
return -ENOEXEC;
}
cnt[0] = bprm->argc;
cnt[1] = bprm->envc;
@@ -124,8 +131,10 @@ static int load_elf(struct linux_binprm *bprm
bprm->p, 1, 0, 1,
&page, NULL);
#endif
if(rc <= 0)
if(rc <= 0) {
kfree(pbuf);
return -EFAULT;
}
addr = kmap_atomic(page
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0)
, KM_USER0
@@ -199,21 +208,27 @@ static int load_elf(struct linux_binprm *bprm
for(ep = env; ep->name; ep++)
if(ep->val)
kfree(ep->val);
if(rc)
if(rc) {
kfree(pbuf);
return -ENOEXEC;
}
file = open_exec(MCEXEC_PATH);
if (IS_ERR(file))
if (IS_ERR(file)) {
kfree(pbuf);
return -ENOEXEC;
}
rc = remove_arg_zero(bprm);
if (rc){
fput(file);
kfree(pbuf);
return rc;
}
rc = copy_strings_kernel(1, &bprm->interp, bprm);
if (rc < 0){
fput(file);
kfree(pbuf);
return rc;
}
bprm->argc++;
@@ -221,12 +236,14 @@ static int load_elf(struct linux_binprm *bprm
rc = copy_strings_kernel(1, &wp, bprm);
if (rc){
fput(file);
kfree(pbuf);
return rc;
}
bprm->argc++;
rc = bprm_change_interp(MCEXEC_PATH, bprm);
if (rc < 0){
fput(file);
kfree(pbuf);
return rc;
}
@@ -236,8 +253,12 @@ static int load_elf(struct linux_binprm *bprm
rc = prepare_binprm(bprm);
if (rc < 0){
kfree(pbuf);
return rc;
}
kfree(pbuf);
return search_binary_handler(bprm
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)
, regs

File diff suppressed because it is too large Load Diff

View File

@@ -27,6 +27,7 @@
#include <linux/slab.h>
#include <linux/device.h>
#include "mcctrl.h"
#include <ihk/ihk_host_user.h>
#define OS_MAX_MINOR 64
@@ -45,6 +46,12 @@ extern void rus_page_hash_put_pages(void);
extern void binfmt_mcexec_init(void);
extern void binfmt_mcexec_exit(void);
extern int mcctrl_os_read_cpu_register(ihk_os_t os, int cpu,
struct ihk_os_cpu_register *desc);
extern int mcctrl_os_write_cpu_register(ihk_os_t os, int cpu,
struct ihk_os_cpu_register *desc);
extern int mcctrl_get_request_os_cpu(ihk_os_t os, int *cpu);
static long mcctrl_ioctl(ihk_os_t os, unsigned int request, void *priv,
unsigned long arg, struct file *file)
{
@@ -60,6 +67,9 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
{ .request = MCEXEC_UP_LOAD_SYSCALL, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SEND_SIGNAL, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CPU, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_NODES, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CPUSET, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_CREATE_PPD, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_STRNCPY_FROM_USER, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_NEW_PROCESS, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_PREPARE_DMA, .func = mcctrl_ioctl },
@@ -69,8 +79,29 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
{ .request = MCEXEC_UP_GET_CRED, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CREDV, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_MOUNT, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_UMOUNT, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_UNSHARE, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_UTIL_THREAD1, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_UTIL_THREAD2, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SIG_THREAD, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYSCALL_THREAD, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_TERMINATE_THREAD, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_NUM_POOL_THREADS, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_DEBUG_LOG, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_COPY_FROM_MCK, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_COPY_TO_MCK, .func = mcctrl_ioctl },
{ .request = IHK_OS_AUX_PERF_NUM, .func = mcctrl_ioctl },
{ .request = IHK_OS_AUX_PERF_SET, .func = mcctrl_ioctl },
{ .request = IHK_OS_AUX_PERF_GET, .func = mcctrl_ioctl },
{ .request = IHK_OS_AUX_PERF_ENABLE, .func = mcctrl_ioctl },
{ .request = IHK_OS_AUX_PERF_DISABLE, .func = mcctrl_ioctl },
{ .request = IHK_OS_AUX_PERF_DESTROY, .func = mcctrl_ioctl },
};
/*
 * Kernel-side callbacks that mcctrl hands to IHK for each OS instance;
 * installed with ihk_os_set_kernel_call_handlers() in
 * mcctrl_os_boot_notifier(). All three targets are declared extern in this file.
 */
static struct ihk_os_kernel_call_handler mcctrl_kernel_handlers = {
.get_request_cpu = mcctrl_get_request_os_cpu,
.read_cpu_register = mcctrl_os_read_cpu_register,
.write_cpu_register = mcctrl_os_write_cpu_register,
};
static struct ihk_os_user_call mcctrl_uc_proto = {
@@ -107,12 +138,16 @@ int mcctrl_os_boot_notifier(int os_index)
memcpy(mcctrl_uc + os_index, &mcctrl_uc_proto, sizeof mcctrl_uc_proto);
rc = ihk_os_set_kernel_call_handlers(os[os_index], &mcctrl_kernel_handlers);
if (rc < 0) {
printk("mcctrl: error: setting kernel callbacks for OS %d\n", os_index);
goto error_cleanup_channels;
}
rc = ihk_os_register_user_call_handlers(os[os_index], mcctrl_uc + os_index);
if (rc < 0) {
destroy_ikc_channels(os[os_index]);
printk("mcctrl: error: registering callbacks for OS %d\n", os_index);
goto error_cleanup_channels;
goto error_clear_kernel_handlers;
}
procfs_init(os_index);
@@ -120,6 +155,8 @@ int mcctrl_os_boot_notifier(int os_index)
return 0;
error_clear_kernel_handlers:
ihk_os_clear_kernel_call_handlers(os[os_index]);
error_cleanup_channels:
destroy_ikc_channels(os[os_index]);
@@ -129,11 +166,16 @@ error_cleanup_channels:
int mcctrl_os_shutdown_notifier(int os_index)
{
sysfsm_cleanup(os[os_index]);
free_topology_info(os[os_index]);
ihk_os_unregister_user_call_handlers(os[os_index], mcctrl_uc + os_index);
destroy_ikc_channels(os[os_index]);
procfs_exit(os_index);
if (os[os_index]) {
sysfsm_cleanup(os[os_index]);
free_topology_info(os[os_index]);
ihk_os_unregister_user_call_handlers(os[os_index], mcctrl_uc + os_index);
ihk_os_clear_kernel_call_handlers(os[os_index]);
destroy_ikc_channels(os[os_index]);
procfs_exit(os_index);
}
os[os_index] = NULL;
printk("mcctrl: OS ID %d shutdown event handled\n", os_index);
return 0;
@@ -151,11 +193,16 @@ static struct ihk_os_notifier mcctrl_os_notifier = {
static int __init mcctrl_init(void)
{
int ret = 0;
int i;
#ifndef DO_USER_MODE
mcctrl_syscall_init();
#endif
for (i = 0; i < OS_MAX_MINOR; ++i) {
os[i] = NULL;
}
rus_page_hash_init();
binfmt_mcexec_init();

View File

@@ -35,6 +35,16 @@
#define REQUEST_SHIFT 16
/*
 * Logging helpers for this file. Uncomment DEBUG_IKC to enable debug output.
 *   dkprintf: debug message. With DEBUG_IKC it expands to kprintf(); otherwise
 *             it expands to a printk() call guarded by if (0), so the
 *             arguments are still compiled but never evaluated.
 *   ekprintf: error message. Expands to kprintf() with DEBUG_IKC and to
 *             printk() otherwise.
 */
//#define DEBUG_IKC
#ifdef DEBUG_IKC
#define dkprintf(...) kprintf(__VA_ARGS__)
#define ekprintf(...) kprintf(__VA_ARGS__)
#else
#define dkprintf(...) do { if (0) printk(__VA_ARGS__); } while (0)
#define ekprintf(...) printk(__VA_ARGS__)
#endif
//int num_channels;
//struct mcctrl_channel *channels;
@@ -43,6 +53,10 @@ void mcexec_prepare_ack(ihk_os_t os, unsigned long arg, int err);
static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c);
int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet);
void sig_done(unsigned long arg, int err);
void mcctrl_perf_ack(ihk_os_t os, struct ikc_scd_packet *packet);
void mcctrl_os_read_write_cpu_response(ihk_os_t os,
struct ikc_scd_packet *pisp);
void mcctrl_eventfd(ihk_os_t os, struct ikc_scd_packet *pisp);
/* XXX: this runs in atomic context! */
static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
@@ -70,7 +84,7 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
break;
case SCD_MSG_PROCFS_ANSWER:
procfs_answer(pisp->arg, pisp->err);
procfs_answer(usrdata, pisp->pid);
break;
case SCD_MSG_SEND_SIGNAL:
@@ -99,8 +113,16 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
get_vdso_info(__os, pisp->arg);
break;
case SCD_MSG_REPLY_GET_CPU_MAPPING:
reply_get_cpu_mapping(pisp->arg);
case SCD_MSG_PERF_ACK:
mcctrl_perf_ack(__os, pisp);
break;
case SCD_MSG_CPU_RW_REG_RESP:
mcctrl_os_read_write_cpu_response(__os, pisp);
break;
case SCD_MSG_EVENTFD:
mcctrl_eventfd(__os, pisp);
break;
default:
@@ -116,11 +138,22 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
* mcexec_ret_syscall(), for the rest, free it here.
*/
if (msg != SCD_MSG_SYSCALL_ONESIDE) {
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)__packet, c);
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)__packet,
(usrdata->ikc2linux[smp_processor_id()] ?
usrdata->ikc2linux[smp_processor_id()] :
usrdata->ikc2linux[0]));
}
return 0;
}
/*
 * Packet handler that connect_handler_ikc2mckernel() installs on its
 * channels. It logs a warning for any packet that arrives, returns the
 * packet to channel c, and reports success.
 *
 * \param c        channel the packet arrived on
 * \param __packet received packet, released back to c
 * \param __os     unused
 * \return always 0
 */
static int dummy_packet_handler(struct ihk_ikc_channel_desc *c,
		void *__packet, void *__os)
{
	struct ihk_ikc_free_packet *unexpected = __packet;

	kprintf("%s: WARNING: packet received\n", __FUNCTION__);
	ihk_ikc_release_packet(unexpected, c);

	return 0;
}
int mcctrl_ikc_send(ihk_os_t os, int cpu, struct ikc_scd_packet *pisp)
{
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
@@ -168,186 +201,146 @@ int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu)
}
}
//unsigned long *mcctrl_doorbell_va;
//unsigned long mcctrl_doorbell_pa;
static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c)
{
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct ikc_scd_packet packet;
struct mcctrl_channel *pmc = usrdata->channels + cpu;
unsigned long phys;
struct ikc_scd_init_param *rpm;
if(c->port == 502)
if (c->port == 502) {
pmc = usrdata->channels + usrdata->num_channels - 1;
if (!pmc) {
return;
}
printk("IKC init: cpu=%d port=%d\n", cpu, c->port);
phys = ihk_device_map_memory(ihk_os_to_dev(os), rphys,
sizeof(struct ikc_scd_init_param));
#ifdef CONFIG_MIC
rpm = ioremap_wc(phys, sizeof(struct ikc_scd_init_param));
#else
rpm = ihk_device_map_virtual(ihk_os_to_dev(os), phys,
sizeof(struct ikc_scd_init_param),
NULL, 0);
#endif
pmc->param.request_va =
(void *)__get_free_pages(in_interrupt() ? GFP_ATOMIC : GFP_KERNEL,
REQUEST_SHIFT - PAGE_SHIFT);
pmc->param.request_pa = virt_to_phys(pmc->param.request_va);
pmc->param.doorbell_va = usrdata->mcctrl_doorbell_va;
pmc->param.doorbell_pa = usrdata->mcctrl_doorbell_pa;
pmc->param.post_va = (void *)__get_free_page(in_interrupt() ?
GFP_ATOMIC : GFP_KERNEL);
pmc->param.post_pa = virt_to_phys(pmc->param.post_va);
memset(pmc->param.doorbell_va, 0, PAGE_SIZE);
memset(pmc->param.request_va, 0, PAGE_SIZE);
memset(pmc->param.post_va, 0, PAGE_SIZE);
pmc->param.response_rpa = rpm->response_page;
pmc->param.response_pa
= ihk_device_map_memory(ihk_os_to_dev(os),
pmc->param.response_rpa,
PAGE_SIZE);
#ifdef CONFIG_MIC
pmc->param.response_va = ioremap_cache(pmc->param.response_pa,
PAGE_SIZE);
#else
pmc->param.response_va = ihk_device_map_virtual(ihk_os_to_dev(os),
pmc->param.response_pa,
PAGE_SIZE, NULL, 0);
#endif
pmc->dma_buf = (void *)__get_free_pages(in_interrupt() ?
GFP_ATOMIC : GFP_KERNEL,
DMA_PIN_SHIFT - PAGE_SHIFT);
rpm->request_page = pmc->param.request_pa;
rpm->doorbell_page = pmc->param.doorbell_pa;
rpm->post_page = pmc->param.post_pa;
if (!pmc) {
kprintf("%s: error: no channel found?\n", __FUNCTION__);
return;
}
packet.msg = SCD_MSG_INIT_CHANNEL_ACKED;
packet.ref = cpu;
packet.arg = rphys;
printk("Request: %lx, Response: %lx, Doorbell: %lx\n",
pmc->param.request_pa, pmc->param.response_rpa,
pmc->param.doorbell_pa);
printk("Request: %p, Response: %p, Doorbell: %p\n",
pmc->param.request_va, pmc->param.response_va,
pmc->param.doorbell_va);
ihk_ikc_send(pmc->c, &packet, 0);
#ifdef CONFIG_MIC
iounmap(rpm);
#else
ihk_device_unmap_virtual(ihk_os_to_dev(os), rpm,
sizeof(struct ikc_scd_init_param));
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os), phys,
sizeof(struct ikc_scd_init_param));
}
static int connect_handler(struct ihk_ikc_channel_info *param)
static int connect_handler_ikc2linux(struct ihk_ikc_channel_info *param)
{
struct ihk_ikc_channel_desc *c;
int cpu;
ihk_os_t os = param->channel->remote_os;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
int linux_cpu;
c = param->channel;
cpu = c->send.queue->read_cpu;
linux_cpu = c->send.queue->write_cpu;
if (linux_cpu > nr_cpu_ids) {
kprintf("%s: invalid Linux CPU id %d\n",
__FUNCTION__, linux_cpu);
return -1;
}
dkprintf("%s: Linux CPU: %d\n", __FUNCTION__, linux_cpu);
if (cpu < 0 || cpu >= usrdata->num_channels) {
kprintf("Invalid connect source processor: %d\n", cpu);
param->packet_handler = syscall_packet_handler;
usrdata->ikc2linux[linux_cpu] = c;
return 0;
}
static int connect_handler_ikc2mckernel(struct ihk_ikc_channel_info *param)
{
struct ihk_ikc_channel_desc *c;
int mck_cpu;
ihk_os_t os = param->channel->remote_os;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
c = param->channel;
mck_cpu = c->send.queue->read_cpu;
if (mck_cpu < 0 || mck_cpu >= usrdata->num_channels) {
kprintf("Invalid connect source processor: %d\n", mck_cpu);
return 1;
}
param->packet_handler = syscall_packet_handler;
usrdata->channels[cpu].c = c;
kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
param->packet_handler = dummy_packet_handler;
usrdata->channels[mck_cpu].c = c;
return 0;
}
static int connect_handler2(struct ihk_ikc_channel_info *param)
{
struct ihk_ikc_channel_desc *c;
int cpu;
ihk_os_t os = param->channel->remote_os;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
c = param->channel;
cpu = usrdata->num_channels - 1;
param->packet_handler = syscall_packet_handler;
usrdata->channels[cpu].c = c;
kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
return 0;
}
static struct ihk_ikc_listen_param listen_param = {
.port = 501,
.handler = connect_handler,
static struct ihk_ikc_listen_param lp_ikc2linux = {
.port = 503,
.ikc_direction = IHK_IKC_DIRECTION_RECV,
.handler = connect_handler_ikc2linux,
.pkt_size = sizeof(struct ikc_scd_packet),
.queue_size = PAGE_SIZE,
.queue_size = PAGE_SIZE * 4,
.magic = 0x1129,
};
static struct ihk_ikc_listen_param listen_param2 = {
.port = 502,
.handler = connect_handler2,
static struct ihk_ikc_listen_param lp_ikc2mckernel = {
.port = 501,
.ikc_direction = IHK_IKC_DIRECTION_SEND,
.handler = connect_handler_ikc2mckernel,
.pkt_size = sizeof(struct ikc_scd_packet),
.queue_size = PAGE_SIZE,
.queue_size = PAGE_SIZE * 4,
.magic = 0x1329,
};
int prepare_ikc_channels(ihk_os_t os)
{
struct ihk_cpu_info *info;
struct mcctrl_usrdata *usrdata;
struct mcctrl_usrdata *usrdata;
int i;
int ret = 0;
usrdata = kzalloc(sizeof(struct mcctrl_usrdata), GFP_KERNEL);
usrdata->mcctrl_doorbell_va = (void *)__get_free_page(GFP_KERNEL);
usrdata->mcctrl_doorbell_pa = virt_to_phys(usrdata->mcctrl_doorbell_va);
info = ihk_os_get_cpu_info(os);
if (!info) {
printk("Error: cannot retrieve CPU info.\n");
return -EINVAL;
}
if (info->n_cpus < 1) {
printk("Error: # of cpu is invalid.\n");
return -EINVAL;
if (!usrdata) {
printk("%s: error: allocating mcctrl_usrdata\n", __FUNCTION__);
ret = -ENOMEM;
goto error;
}
usrdata->num_channels = info->n_cpus + 1;
usrdata->channels = kzalloc(sizeof(struct mcctrl_channel) * usrdata->num_channels,
GFP_KERNEL);
usrdata->cpu_info = ihk_os_get_cpu_info(os);
usrdata->mem_info = ihk_os_get_memory_info(os);
if (!usrdata->cpu_info || !usrdata->mem_info) {
printk("%s: cannot obtain OS CPU and memory information.\n",
__FUNCTION__);
ret = -EINVAL;
goto error;
}
if (usrdata->cpu_info->n_cpus < 1) {
printk("%s: Error: # of cpu is invalid.\n", __FUNCTION__);
ret = -EINVAL;
goto error;
}
usrdata->num_channels = usrdata->cpu_info->n_cpus;
usrdata->channels = kzalloc(sizeof(struct mcctrl_channel) *
usrdata->num_channels,
GFP_KERNEL);
if (!usrdata->channels) {
printk("Error: cannot allocate channels.\n");
return -ENOMEM;
ret = -ENOMEM;
goto error;
}
usrdata->ikc2linux = kzalloc(sizeof(struct ihk_ikc_channel_desc *) *
nr_cpu_ids, GFP_KERNEL);
if (!usrdata->ikc2linux) {
printk("Error: cannot allocate ikc2linux channels.\n");
ret = -ENOMEM;
goto error;
}
usrdata->os = os;
init_waitqueue_head(&usrdata->wq_prepare);
ihk_host_os_set_usrdata(os, usrdata);
memcpy(&usrdata->listen_param, &listen_param, sizeof listen_param);
ihk_ikc_listen_port(os, &usrdata->listen_param);
memcpy(&usrdata->listen_param2, &listen_param2, sizeof listen_param2);
ihk_ikc_listen_port(os, &usrdata->listen_param2);
ihk_ikc_listen_port(os, &lp_ikc2linux);
ihk_ikc_listen_port(os, &lp_ikc2mckernel);
init_waitqueue_head(&usrdata->wq_procfs);
mutex_init(&usrdata->reserve_lock);
for (i = 0; i < MCCTRL_PER_PROC_DATA_HASH_SIZE; ++i) {
INIT_LIST_HEAD(&usrdata->per_proc_data_hash[i]);
@@ -357,25 +350,25 @@ int prepare_ikc_channels(ihk_os_t os)
INIT_LIST_HEAD(&usrdata->cpu_topology_list);
INIT_LIST_HEAD(&usrdata->node_topology_list);
mutex_init(&usrdata->part_exec.lock);
INIT_LIST_HEAD(&usrdata->part_exec.pli_list);
usrdata->part_exec.nr_processes = -1;
return 0;
error:
if (usrdata) {
if (usrdata->channels) kfree(usrdata->channels);
if (usrdata->ikc2linux) kfree(usrdata->ikc2linux);
kfree(usrdata);
}
return ret;
}
void __destroy_ikc_channel(ihk_os_t os, struct mcctrl_channel *pmc)
{
free_pages((unsigned long)pmc->param.request_va,
REQUEST_SHIFT - PAGE_SHIFT);
free_page((unsigned long)pmc->param.post_va);
#ifdef CONFIG_MIC
iounmap(pmc->param.response_va);
#else
ihk_device_unmap_virtual(ihk_os_to_dev(os), pmc->param.response_va,
PAGE_SIZE);
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os),
pmc->param.response_pa, PAGE_SIZE);
free_pages((unsigned long)pmc->dma_buf,
DMA_PIN_SHIFT - PAGE_SHIFT);
return;
}
void destroy_ikc_channels(ihk_os_t os)
@@ -383,18 +376,32 @@ void destroy_ikc_channels(ihk_os_t os)
int i;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
if (!usrdata) {
printk("%s: WARNING: no mcctrl_usrdata found\n", __FUNCTION__);
return;
}
ihk_host_os_set_usrdata(os, NULL);
for (i = 0; i < usrdata->num_channels; i++) {
if (usrdata->channels[i].c) {
// ihk_ikc_disconnect(usrdata->channels[i].c);
ihk_ikc_free_channel(usrdata->channels[i].c);
__destroy_ikc_channel(os, usrdata->channels + i);
printk("Channel #%d freed.\n", i);
ihk_ikc_destroy_channel(usrdata->channels[i].c);
}
}
for (i = 0; i < nr_cpu_ids; i++) {
if (usrdata->ikc2linux[i]) {
ihk_ikc_destroy_channel(usrdata->ikc2linux[i]);
}
}
free_page((unsigned long)usrdata->mcctrl_doorbell_va);
kfree(usrdata->channels);
kfree(usrdata->ikc2linux);
kfree(usrdata);
}
/*
 * Handler for SCD_MSG_EVENTFD packets (called from syscall_packet_handler()).
 * Forwards to ihk_os_eventfd(os, 0); the packet contents (pisp) are not used.
 */
void
mcctrl_eventfd(ihk_os_t os, struct ikc_scd_packet *pisp)
{
ihk_os_eventfd(os, 0);
}

View File

@@ -59,8 +59,8 @@
#define SCD_MSG_CLEANUP_PROCESS 0x9
#define SCD_MSG_GET_VDSO_INFO 0xa
#define SCD_MSG_GET_CPU_MAPPING 0xc
#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
//#define SCD_MSG_GET_CPU_MAPPING 0xc
//#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
#define SCD_MSG_PROCFS_CREATE 0x10
#define SCD_MSG_PROCFS_DELETE 0x11
@@ -92,6 +92,14 @@
#define SCD_MSG_PROCFS_TID_CREATE 0x44
#define SCD_MSG_PROCFS_TID_DELETE 0x45
#define SCD_MSG_EVENTFD 0x46
#define SCD_MSG_PERF_CTRL 0x50
#define SCD_MSG_PERF_ACK 0x51
#define SCD_MSG_CPU_RW_REG 0x52
#define SCD_MSG_CPU_RW_REG_RESP 0x53
#define DMA_PIN_SHIFT 21
#define DO_USER_MODE
@@ -103,6 +111,12 @@ struct coretable {
unsigned long addr;
};
/*
 * Operation code carried in the op member of the SCD_MSG_CPU_RW_REG part of
 * struct ikc_scd_packet. MCCTRL_OS_CPU_MAX_OP is the number of real
 * operations, not an operation itself.
 */
enum mcctrl_os_cpu_operation {
MCCTRL_OS_CPU_READ_REGISTER,
MCCTRL_OS_CPU_WRITE_REGISTER,
MCCTRL_OS_CPU_MAX_OP
};
struct ikc_scd_packet {
int msg;
int err;
@@ -128,6 +142,13 @@ struct ikc_scd_packet {
struct {
int ttid;
};
/* SCD_MSG_CPU_RW_REG */
struct {
struct ihk_os_cpu_register desc;
enum mcctrl_os_cpu_operation op;
void *resp;
};
};
char padding[12];
};
@@ -172,7 +193,6 @@ struct wait_queue_head_list_node {
struct mcctrl_channel {
struct ihk_ikc_channel_desc *c;
struct syscall_params param;
struct ikc_scd_init_param init;
void *dma_buf;
};
@@ -188,17 +208,27 @@ struct mcctrl_per_thread_data {
#define MCCTRL_PER_THREAD_DATA_HASH_MASK (MCCTRL_PER_THREAD_DATA_HASH_SIZE - 1)
struct mcctrl_per_proc_data {
struct mcctrl_usrdata *ud;
struct list_head hash;
int pid;
unsigned long rpgtable; /* per process, not per OS */
struct list_head wq_list;
struct list_head wq_req_list;
struct list_head wq_list_exact;
struct list_head wq_list; /* All these requests come from mcexec */
struct list_head wq_req_list; /* These requests come from IKC IRQ handler (can be processed by any threads) */
struct list_head wq_list_exact; /* These requests come from IKC IRQ handler targeting a particular thread */
ihk_spinlock_t wq_list_lock;
wait_queue_head_t wq_prepare;
wait_queue_head_t wq_procfs;
struct list_head per_thread_data_hash[MCCTRL_PER_THREAD_DATA_HASH_SIZE];
rwlock_t per_thread_data_hash_lock[MCCTRL_PER_THREAD_DATA_HASH_SIZE];
cpumask_t cpu_set;
int ikc_target_cpu;
atomic_t refcount;
struct list_head devobj_pager_list;
struct semaphore devobj_pager_lock;
};
struct sysfsm_req {
@@ -226,11 +256,6 @@ static inline int sysfs_inited(struct sysfsm_data *sdp)
return !!(sdp->sysfs_buf);
} /* sysfs_inited() */
struct cpu_mapping {
int cpu_number;
int hw_id;
};
struct cache_topology {
struct ihk_cache_topology *saved;
cpumask_t shared_cpu_map;
@@ -239,8 +264,9 @@ struct cache_topology {
};
struct cpu_topology {
struct cpu_mapping *cpu_mapping;
//struct mcctrl_usrdata *udp;
struct ihk_cpu_topology *saved;
int mckernel_cpu_id;
cpumask_t core_siblings;
cpumask_t thread_siblings;
@@ -248,13 +274,32 @@ struct cpu_topology {
struct list_head cache_list;
};
/* Size in bytes of node_topology.mckernel_numa_distance_s. */
#define NODE_DISTANCE_S_SIZE 1024
/*
 * Per-NUMA-node topology record, linked through chain.
 * NOTE(review): presumably queued on mcctrl_usrdata.node_topology_list — confirm.
 */
struct node_topology {
struct ihk_node_topology *saved;
/* NOTE(review): presumably the node id as numbered by McKernel — confirm */
int mckernel_numa_id;
/* Character buffer; presumably the textual NUMA distance list for this node — confirm */
char mckernel_numa_distance_s[NODE_DISTANCE_S_SIZE];
cpumask_t cpumap;
struct list_head chain;
};
/*
 * List node pairing a task with a ready flag and its own wait queue.
 * NOTE(review): presumably one entry per process queued on
 * mcctrl_part_exec.pli_list, woken through pli_wq once ready is set — confirm.
 */
struct process_list_item {
int ready;
struct task_struct *task;
struct list_head list;
wait_queue_head_t pli_wq;
};
/*
 * Per-OS state embedded in struct mcctrl_usrdata as part_exec.
 * prepare_ikc_channels() initialises lock and pli_list and sets
 * nr_processes to -1.
 * NOTE(review): presumably tracks partitioned execution of a multi-process
 * job, with -1 meaning "no job in progress" — confirm.
 */
struct mcctrl_part_exec {
/* NOTE(review): presumably guards the other members — confirm */
struct mutex lock;
int nr_processes;
int nr_processes_left;
cpumask_t cpus_used;
/* List head; presumably of struct process_list_item entries — confirm */
struct list_head pli_list;
};
#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))
#define MCCTRL_PER_PROC_DATA_HASH_SHIFT 7
@@ -266,28 +311,30 @@ struct mcctrl_usrdata {
struct ihk_ikc_listen_param listen_param2;
ihk_os_t os;
int num_channels;
struct mcctrl_channel *channels;
unsigned long *mcctrl_doorbell_va;
unsigned long mcctrl_doorbell_pa;
/* Channels used for sending messages to LWK */
struct mcctrl_channel *channels;
/* Channels used for receiving messages from LWK */
struct ihk_ikc_channel_desc **ikc2linux;
int remaining_job;
int base_cpu;
int job_pos;
int mcctrl_dma_abort;
struct mutex reserve_lock;
unsigned long last_thread_exec;
wait_queue_head_t wq_prepare;
wait_queue_head_t wq_procfs;
struct list_head per_proc_data_hash[MCCTRL_PER_PROC_DATA_HASH_SIZE];
rwlock_t per_proc_data_hash_lock[MCCTRL_PER_PROC_DATA_HASH_SIZE];
void **keys;
struct sysfsm_data sysfsm_data;
unsigned long cpu_online[CPU_LONGS];
int cpu_mapping_elems;
int padding;
struct cpu_mapping *cpu_mapping;
long cpu_mapping_pa;
struct ihk_cpu_info *cpu_info;
struct ihk_mem_info *mem_info;
nodemask_t numa_online;
struct list_head cpu_topology_list;
struct list_head node_topology_list;
struct mcctrl_part_exec part_exec;
int perf_event_num;
};
struct mcctrl_signal {
@@ -305,12 +352,16 @@ int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu);
ihk_os_t osnum_to_os(int n);
/* syscall.c */
void pager_add_process(void);
void pager_remove_process(struct mcctrl_per_proc_data *ppd);
int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet);
int mcctrl_add_per_proc_data(struct mcctrl_usrdata *ud, int pid,
struct mcctrl_per_proc_data *ppd);
int mcctrl_delete_per_proc_data(struct mcctrl_usrdata *ud, int pid);
inline struct mcctrl_per_proc_data *mcctrl_get_per_proc_data(
struct mcctrl_per_proc_data *mcctrl_get_per_proc_data(
struct mcctrl_usrdata *ud, int pid);
void mcctrl_put_per_proc_data(struct mcctrl_per_proc_data *ppd);
int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data* ppd,
struct task_struct *task, void *data);
@@ -322,7 +373,7 @@ inline struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(
void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
long ret, int stid);
#define PROCFS_NAME_MAX 1000
#define PROCFS_NAME_MAX 768
struct procfs_read {
unsigned long pbuf; /* physical address of the host buffer (request) */
@@ -342,7 +393,7 @@ struct procfs_file {
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
void procfs_answer(unsigned int arg, int err);
void procfs_answer(struct mcctrl_usrdata *ud, int pid);
int procfsm_packet_handler(void *os, int msg, int pid, unsigned long arg);
void add_tid_entry(int osnum, int pid, int tid);
void add_pid_entry(int osnum, int pid);
@@ -390,4 +441,14 @@ struct get_cpu_mapping_req {
wait_queue_head_t wq;
};
/*
 * Performance event attributes: a config word plus six single-bit flags.
 * The flag names match the bit-fields of the SET/GET part of
 * struct perf_ctrl_desc.
 * NOTE(review): presumably modelled on the same-named fields of Linux
 * struct perf_event_attr — confirm.
 */
struct ihk_perf_event_attr{
unsigned long config;
unsigned disabled:1;
unsigned pinned:1;
unsigned exclude_user:1;
unsigned exclude_kernel:1;
unsigned exclude_hv:1;
unsigned exclude_idle:1;
};
#endif

View File

@@ -59,7 +59,6 @@ static const struct procfs_entry base_entry_stuff[];
static const struct file_operations mckernel_forward_ro;
static const struct file_operations mckernel_forward;
static DECLARE_WAIT_QUEUE_HEAD(procfsq);
static ssize_t mckernel_procfs_read(struct file *file, char __user *buf,
size_t nbytes, loff_t *ppos);
@@ -106,14 +105,28 @@ getpath(struct procfs_list_entry *e, char *buf, int bufsize)
/**
* \brief Process SCD_MSG_PROCFS_ANSWER message.
*
* \param arg sent argument
* \param err error info (redundant)
* \param ud mcctrl_usrdata pointer
* \param pid PID of the requesting process
*/
void
procfs_answer(unsigned int arg, int err)
void procfs_answer(struct mcctrl_usrdata *ud, int pid)
{
dprintk("procfs: received SCD_MSG_PROCFS_ANSWER message(err = %d).\n", err);
wake_up_interruptible(&procfsq);
struct mcctrl_per_proc_data *ppd = NULL;
if (pid > 0) {
ppd = mcctrl_get_per_proc_data(ud, pid);
if (unlikely(!ppd)) {
kprintf("%s: ERROR: no per-process structure for PID %d\n",
__FUNCTION__, pid);
return;
}
}
wake_up_all(pid > 0 ? &ppd->wq_procfs : &ud->wq_procfs);
if (pid > 0) {
mcctrl_put_per_proc_data(ppd);
}
}
static struct procfs_list_entry *
@@ -248,9 +261,11 @@ get_pid_cred(int pid)
{
struct task_struct *task = NULL;
if(pid > 0){
if (pid > 0) {
rcu_read_lock();
task = pid_task(find_vpid(pid), PIDTYPE_PID);
if(task){
rcu_read_unlock();
if (task) {
return __task_cred(task);
}
}
@@ -481,8 +496,9 @@ procfs_exit(int osnum)
down(&procfs_file_list_lock);
e = find_base_entry(osnum);
if(e)
if (e) {
delete_procfs_entries(e);
}
up(&procfs_file_list_lock);
}
@@ -492,36 +508,84 @@ procfs_exit(int osnum)
* This function conforms to the 2) way of fs/proc/generic.c
* from linux-2.6.39.4.
*/
static ssize_t
mckernel_procfs_read(struct file *file, char __user *buf, size_t nbytes,
loff_t *ppos)
static ssize_t __mckernel_procfs_read_write(
struct file *file,
char __user *buf, size_t nbytes,
loff_t *ppos, int read_write)
{
struct inode * inode = file->f_path.dentry->d_inode;
struct inode * inode = file->f_inode;
char *kern_buffer = NULL;
int order = 0;
volatile struct procfs_read *r = NULL;
struct ikc_scd_packet isp;
int ret;
int ret, osnum, pid, retw;
unsigned long pbuf;
unsigned long count = nbytes;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
struct proc_dir_entry *dp = PDE(inode);
struct procfs_list_entry *e = dp->data;
#else
#else
struct procfs_list_entry *e = PDE_DATA(inode);
#endif
#endif
loff_t offset = *ppos;
char pathbuf[PROCFS_NAME_MAX];
char *path;
char *path, *p;
ihk_os_t os = NULL;
struct mcctrl_usrdata *udp = NULL;
struct mcctrl_per_proc_data *ppd = NULL;
path = getpath(e, pathbuf, 256);
dprintk("mckernel_procfs_read: invoked for %s, offset: %lu, count: %d\n",
path, offset, count);
if (count <= 0 || offset < 0) {
return 0;
}
path = getpath(e, pathbuf, PROCFS_NAME_MAX);
dprintk("%s: invoked for %s, offset: %lu, count: %lu\n",
__FUNCTION__, path,
(unsigned long)offset, count);
/* Verify OS number */
ret = sscanf(path, "mcos%d/", &osnum);
if (ret != 1) {
printk("%s: error: couldn't determine OS number\n", __FUNCTION__);
return -EINVAL;
}
if (osnum != e->osnum) {
printk("%s: error: OS numbers don't match\n", __FUNCTION__);
return -EINVAL;
}
/* Is this request for a specific process? */
p = strchr(path, '/') + 1;
ret = sscanf(p, "%d/", &pid);
if (ret != 1) {
pid = -1;
}
os = osnum_to_os(osnum);
if (!os) {
printk("%s: error: no IHK OS data found for OS %d\n",
__FUNCTION__, osnum);
return -EINVAL;
}
udp = ihk_host_os_get_usrdata(os);
if (!udp) {
printk("%s: error: no MCCTRL data found for OS %d\n",
__FUNCTION__, osnum);
return -EINVAL;
}
if (pid > 0) {
ppd = mcctrl_get_per_proc_data(udp, pid);
if (unlikely(!ppd)) {
printk("%s: error: no per-process structure for PID %d",
__FUNCTION__, pid);
return -EINVAL;
}
}
while ((1 << order) < count) ++order;
if (order > 12) {
order -= 12;
@@ -533,10 +597,11 @@ mckernel_procfs_read(struct file *file, char __user *buf, size_t nbytes,
/* NOTE: we need physically contigous memory to pass through IKC */
kern_buffer = (char *)__get_free_pages(GFP_KERNEL, order);
if (!kern_buffer) {
printk("mckernel_procfs_read(): ERROR: allocating kernel buffer\n");
return -ENOMEM;
printk("%s: ERROR: allocating kernel buffer\n", __FUNCTION__);
ret = -ENOMEM;
goto out;
}
pbuf = virt_to_phys(kern_buffer);
r = kmalloc(sizeof(struct procfs_read), GFP_KERNEL);
@@ -550,152 +615,96 @@ mckernel_procfs_read(struct file *file, char __user *buf, size_t nbytes,
r->status = 0;
r->offset = offset;
r->count = count;
r->readwrite = 0;
r->readwrite = read_write;
strncpy((char *)r->fname, path, PROCFS_NAME_MAX);
isp.msg = SCD_MSG_PROCFS_REQUEST;
isp.ref = 0;
isp.arg = virt_to_phys(r);
ret = mcctrl_ikc_send(osnum_to_os(e->osnum), 0, &isp);
isp.pid = pid;
ret = mcctrl_ikc_send(osnum_to_os(e->osnum),
(pid > 0) ? ppd->ikc_target_cpu : 0, &isp);
if (ret < 0) {
goto out; /* error */
}
/* Wait for a reply. */
ret = -EIO; /* default exit code */
dprintk("now wait for a relpy\n");
/* Wait for the status field of the procfs_read structure set ready. */
if (wait_event_interruptible_timeout(procfsq, r->status != 0, HZ) == 0) {
kprintf("ERROR: mckernel_procfs_read: timeout (1 sec).\n");
dprintk("%s: waiting for reply\n", __FUNCTION__);
retry_wait:
/* Wait for the status field of the procfs_read structure,
* wait on per-process or OS specific data depending on
* who the request is for.
*/
if (pid > 0) {
retw = wait_event_interruptible_timeout(ppd->wq_procfs,
r->status != 0, 5 * HZ);
}
else {
retw = wait_event_interruptible_timeout(udp->wq_procfs,
r->status != 0, 5 * HZ);
}
/* Timeout? */
if (retw == 0 && r->status == 0) {
printk("%s: error: timeout (1 sec)\n", __FUNCTION__);
goto out;
}
/* Wake up and check the result. */
dprintk("mckernel_procfs_read: woke up. ret: %d, eof: %d\n", r->ret, r->eof);
if (r->ret > 0) {
if (copy_to_user(buf, kern_buffer, r->ret)) {
kprintf("ERROR: mckernel_procfs_read: copy_to_user failed.\n");
ret = -EFAULT;
goto out;
}
/* Interrupted? */
else if (retw == -ERESTARTSYS) {
ret = -ERESTART;
goto out;
}
/* Were we woken up by a reply to another procfs request? */
else if (r->status == 0) {
/* TODO: r->status is not set atomically, we could be woken
* up with status == 0 and it could change to 1 while in this
* code, we could potentially miss the wake_up()...
*/
printk("%s: stale wake-up, retrying\n", __FUNCTION__);
goto retry_wait;
}
/* Wake up and check the result. */
dprintk("%s: woke up. ret: %d, eof: %d\n",
__FUNCTION__, r->ret, r->eof);
if (r->ret > 0) {
if (read_write == 0) {
if (copy_to_user(buf, kern_buffer, r->ret)) {
printk("%s: ERROR: copy_to_user failed.\n", __FUNCTION__);
ret = -EFAULT;
goto out;
}
}
*ppos += r->ret;
}
ret = r->ret;
out:
if(kern_buffer)
if (ppd)
mcctrl_put_per_proc_data(ppd);
if (kern_buffer)
free_pages((uintptr_t)kern_buffer, order);
if(r)
if (r)
kfree((void *)r);
return ret;
}
static ssize_t
mckernel_procfs_write(struct file *file, const char __user *buf, size_t nbytes,
loff_t *ppos)
static ssize_t mckernel_procfs_read(struct file *file,
char __user *buf, size_t nbytes, loff_t *ppos)
{
struct inode * inode = file->f_path.dentry->d_inode;
char *kern_buffer = NULL;
int order = 0;
volatile struct procfs_read *r = NULL;
struct ikc_scd_packet isp;
int ret;
unsigned long pbuf;
unsigned long count = nbytes;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
struct proc_dir_entry *dp = PDE(inode);
struct procfs_list_entry *e = dp->data;
#else
struct procfs_list_entry *e = PDE_DATA(inode);
#endif
loff_t offset = *ppos;
char pathbuf[PROCFS_NAME_MAX];
char *path;
return __mckernel_procfs_read_write(file, buf, nbytes, ppos, 0);
}
path = getpath(e, pathbuf, 256);
dprintk("mckernel_procfs_read: invoked for %s, offset: %lu, count: %d\n",
path, offset, count);
if (count <= 0 || offset < 0) {
return 0;
}
while ((1 << order) < count) ++order;
if (order > 12) {
order -= 12;
}
else {
order = 1;
}
/* NOTE: we need physically contiguous memory to pass through IKC */
kern_buffer = (char *)__get_free_pages(GFP_KERNEL, order);
if (!kern_buffer) {
printk("mckernel_procfs_read(): ERROR: allocating kernel buffer\n");
return -ENOMEM;
}
if (copy_from_user(kern_buffer, buf, nbytes)) {
ret = -EFAULT;
goto out;
}
pbuf = virt_to_phys(kern_buffer);
r = kmalloc(sizeof(struct procfs_read), GFP_KERNEL);
if (r == NULL) {
ret = -ENOMEM;
goto out;
}
dprintk("offset: %lx, count: %d, cpu: %d\n", offset, count, e->cpu);
r->pbuf = pbuf;
r->eof = 0;
r->ret = -EIO; /* default */
r->status = 0;
r->offset = offset;
r->count = count;
r->readwrite = 1;
strncpy((char *)r->fname, path, PROCFS_NAME_MAX);
isp.msg = SCD_MSG_PROCFS_REQUEST;
isp.ref = 0;
isp.arg = virt_to_phys(r);
ret = mcctrl_ikc_send(osnum_to_os(e->osnum), 0, &isp);
if (ret < 0) {
goto out; /* error */
}
/* Wait for a reply. */
ret = -EIO; /* default exit code */
dprintk("now wait for a relpy\n");
/* Wait for the status field of the procfs_read structure set ready. */
if (wait_event_interruptible_timeout(procfsq, r->status != 0, HZ) == 0) {
kprintf("ERROR: mckernel_procfs_read: timeout (1 sec).\n");
goto out;
}
/* Wake up and check the result. */
dprintk("mckernel_procfs_read: woke up. ret: %d, eof: %d\n", r->ret, r->eof);
if (r->ret > 0) {
*ppos += r->ret;
}
ret = r->ret;
out:
if(kern_buffer)
free_pages((uintptr_t)kern_buffer, order);
if(r)
kfree((void *)r);
return ret;
/*
 * procfs write entry point.
 *
 * Forwards the request to the shared read/write helper with the
 * read_write selector set to 1 (write).
 */
static ssize_t mckernel_procfs_write(struct file *file,
		const char __user *buf, size_t nbytes, loff_t *ppos)
{
	/* The shared helper takes a non-const user pointer. */
	char __user *ubuf = (char __user *)buf;

	return __mckernel_procfs_read_write(file, ubuf, nbytes, ppos, 1);
}
static loff_t

View File

@@ -45,7 +45,7 @@
#include <asm/uaccess.h>
#include <asm/delay.h>
#include <asm/io.h>
#include "../../config.h"
#include "../../../config.h"
#include "mcctrl.h"
#include <linux/version.h>
@@ -278,9 +278,178 @@ static int __notify_syscall_requester(ihk_os_t os, struct ikc_scd_packet *packet
return ret;
}
/*
 * syscall_backward() - hand a system call request to the requester of the
 * calling thread's current packet and sleep until a reply arrives.
 *
 * @usrdata:      mcctrl data of the OS instance
 * @num:          system call number stored in the request
 * @arg1..@arg6:  system call arguments stored in the request
 * @ret:          written with req->args[1] when the reply is an __NR_mmap
 *                request carrying PAGER_REQ_RESUME; left untouched otherwise
 *
 * The request is built on the stack and its physical address is published
 * through resp->fault_address with resp->status set to STATUS_SYSCALL.
 * The thread then waits on a per-thread wait queue head registered in
 * ppd->wq_list_exact.
 *
 * Return: 0 on success (including the case where the reply was a pager
 * request served through pager_call()), -EINVAL when there is no per-process
 * structure or the response structure cannot be mapped, -ENOENT when no
 * packet is registered for the thread, the non-zero result of
 * wait_event_interruptible() when the wait was interrupted, -EIO on an
 * unexpected reply.
 */
long syscall_backward(struct mcctrl_usrdata *usrdata, int num,
		unsigned long arg1, unsigned long arg2,
		unsigned long arg3, unsigned long arg4,
		unsigned long arg5, unsigned long arg6,
		unsigned long *ret)
{
	struct ikc_scd_packet *packet;
	struct syscall_request *req;
	struct syscall_response *resp;
	long syscall_ret;
	struct wait_queue_head_list_node *wqhln;
	unsigned long irqflags;
	struct mcctrl_per_proc_data *ppd;
	unsigned long phys;
	struct syscall_request _request[2];
	struct syscall_request *request;

	/*
	 * Use the second array element when the two elements start on
	 * different pages, the first one otherwise.
	 * NOTE(review): presumably this keeps the request from straddling a
	 * page boundary since only its physical address is handed over --
	 * confirm.
	 */
	if (((unsigned long)_request ^ (unsigned long)(_request + 1)) &
			~(PAGE_SIZE -1))
		request = _request + 1;
	else
		request = _request;

	request->number = num;
	request->args[0] = arg1;
	request->args[1] = arg2;
	request->args[2] = arg3;
	request->args[3] = arg4;
	request->args[4] = arg5;
	request->args[5] = arg6;

	/* Look up per-process structure */
	ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
	if (!ppd) {
		kprintf("%s: ERROR: no per-process structure for PID %d??\n",
				__FUNCTION__, task_tgid_vnr(current));
		return -EINVAL;
	}

	packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd, current);
	if (!packet) {
		syscall_ret = -ENOENT;
		printk("%s: no packet registered for TID %d\n",
				__FUNCTION__, task_pid_vnr(current));
		goto out_put_ppd;
	}

	req = &packet->req;

	/* Map response structure */
	phys = ihk_device_map_memory(ihk_os_to_dev(usrdata->os),
			packet->resp_pa, sizeof(*resp));
	resp = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os),
			phys, sizeof(*resp), NULL, 0);
	if (!resp) {
		/* Nothing is queued yet: only the memory mapping is undone. */
		printk("%s: ERROR: invalid response structure address\n",
				__FUNCTION__);
		syscall_ret = -EINVAL;
		goto out_unmap_memory;
	}

retry_alloc:
	wqhln = kmalloc(sizeof(*wqhln), GFP_ATOMIC);
	if (!wqhln) {
		printk("WARNING: coudln't alloc wait queue head, retrying..\n");
		goto retry_alloc;
	}

	/* Prepare per-thread wait queue head */
	wqhln->task = current;
	/* Save the TID explicitly, because mcexec_syscall(), where the request
	 * will be matched, is in IRQ context and can't call task_pid_vnr() */
	wqhln->rtid = task_pid_vnr(current);
	wqhln->req = 0;
	init_waitqueue_head(&wqhln->wq_syscall);

	irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
	/* Add to exact list */
	list_add_tail(&wqhln->list, &ppd->wq_list_exact);
	ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);

	resp->stid = task_pid_vnr(current);
	/* The request is passed by physical address in the fault_address field */
	resp->fault_address = virt_to_phys(request);

#define STATUS_IN_PROGRESS 0
#define STATUS_SYSCALL 4
	req->valid = 0;

	if (__notify_syscall_requester(usrdata->os, packet, resp) < 0) {
		printk("%s: WARNING: failed to notify PID %d\n",
				__FUNCTION__, packet->pid);
	}

	mb();
	resp->status = STATUS_SYSCALL;

	dprintk("%s: tid: %d, syscall: %d SLEEPING\n",
			__FUNCTION__, task_pid_vnr(current), num);
	/* wait for response */
	syscall_ret = wait_event_interruptible(wqhln->wq_syscall, wqhln->req);

	/* Remove per-thread wait queue head */
	irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
	list_del(&wqhln->list);
	ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);

	dprintk("%s: tid: %d, syscall: %d WOKEN UP\n",
			__FUNCTION__, task_pid_vnr(current), num);

	if (syscall_ret) {
		kfree(wqhln);
		goto out;
	}
	else {
		unsigned long phys2;
		struct syscall_response *resp2;

		/* Update packet reference */
		packet = wqhln->packet;
		req = &packet->req;
		phys2 = ihk_device_map_memory(ihk_os_to_dev(usrdata->os),
				packet->resp_pa, sizeof(*resp));
		resp2 = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os),
				phys2, sizeof(*resp), NULL, 0);

		if (resp != resp2) {
			resp = resp2;
			phys = phys2;
			printk("%s: updated new remote PA for resp\n", __FUNCTION__);
		}
	}

	if (!req->valid) {
		printk("%s:not valid\n", __FUNCTION__);
	}
	req->valid = 0;

#define PAGER_REQ_RESUME 0x0101
	/* check result */
	if (req->number != __NR_mmap) {
		printk("%s:unexpected response. %lx %lx\n",
				__FUNCTION__, req->number, req->args[0]);
		/* Already unlinked from the wait list above; release it here
		 * so this error path does not leak the wait queue node. */
		kfree(wqhln);
		syscall_ret = -EIO;
		goto out;
	}
	else if (req->args[0] != PAGER_REQ_RESUME) {
		/* A pager request arrived instead of the resume message:
		 * serve it and report its result to the requester. */
		resp->ret = pager_call(usrdata->os, (void *)req);

		if (__notify_syscall_requester(usrdata->os, packet, resp) < 0) {
			printk("%s: WARNING: failed to notify PID %d\n",
					__FUNCTION__, packet->pid);
		}
		mb();
	}
	else {
		*ret = req->args[1];
	}

	kfree(wqhln);
	syscall_ret = 0;

out:
	ihk_device_unmap_virtual(ihk_os_to_dev(usrdata->os), resp, sizeof(*resp));
out_unmap_memory:
	ihk_device_unmap_memory(ihk_os_to_dev(usrdata->os), phys, sizeof(*resp));

out_put_ppd:
	dprintk("%s: tid: %d, syscall: %d, syscall_ret: %ld\n",
			__FUNCTION__, task_pid_vnr(current), num, syscall_ret);

	mcctrl_put_per_proc_data(ppd);
	return syscall_ret;
}
static int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr, uint64_t reason)
{
struct ikc_scd_packet *packet;
struct ikc_scd_packet *free_packet = NULL;
struct syscall_request *req;
struct syscall_response *resp;
int error;
@@ -306,7 +475,7 @@ static int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr, u
error = -ENOENT;
printk("%s: no packet registered for TID %d\n",
__FUNCTION__, task_pid_vnr(current));
goto out_no_unmap;
goto out_put_ppd;
}
req = &packet->req;
@@ -316,6 +485,12 @@ static int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr, u
packet->resp_pa, sizeof(*resp));
resp = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os),
phys, sizeof(*resp), NULL, 0);
if (!resp) {
printk("%s: ERROR: invalid response structure address\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
retry_alloc:
wqhln = kmalloc(sizeof(*wqhln), GFP_ATOMIC);
@@ -326,6 +501,9 @@ retry_alloc:
/* Prepare per-thread wait queue head */
wqhln->task = current;
/* Save the TID explicitly, because mcexec_syscall(), where the request
* will be matched, is in IRQ context and can't call task_pid_vnr() */
wqhln->rtid = task_pid_vnr(current);
wqhln->req = 0;
init_waitqueue_head(&wqhln->wq_syscall);
@@ -374,6 +552,7 @@ retry_alloc:
else {
/* Update packet reference */
packet = wqhln->packet;
free_packet = packet;
req = &packet->req;
{
unsigned long phys2;
@@ -431,12 +610,20 @@ retry_alloc:
kfree(wqhln);
error = 0;
out:
/* Release remote page-fault response packet */
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)free_packet,
(usrdata->ikc2linux[smp_processor_id()] ?
usrdata->ikc2linux[smp_processor_id()] :
usrdata->ikc2linux[0]));
ihk_device_unmap_virtual(ihk_os_to_dev(usrdata->os), resp, sizeof(*resp));
ihk_device_unmap_memory(ihk_os_to_dev(usrdata->os), phys, sizeof(*resp));
out_no_unmap:
out_put_ppd:
dprintk("%s: tid: %d, fault_addr: %lu, reason: %lu, error: %d\n",
__FUNCTION__, task_pid_vnr(current), fault_addr, reason, error);
mcctrl_put_per_proc_data(ppd);
return error;
}
@@ -574,23 +761,34 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
size_t pix;
#endif
struct mcctrl_per_proc_data *ppd;
struct ikc_scd_packet *packet;
int ret = 0;
dprintk("mcctrl:page fault:flags %#x pgoff %#lx va %p page %p\n",
vmf->flags, vmf->pgoff, vmf->virtual_address, vmf->page);
/* Look up per-process structure */
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (!ppd) {
ppd = mcctrl_get_per_proc_data(usrdata, vma->vm_mm->owner->pid);
}
if (!ppd) {
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
__FUNCTION__, task_tgid_vnr(current));
printk("mcctrl:page fault:flags %#x pgoff %#lx va %p page %p\n",
vmf->flags, vmf->pgoff, vmf->virtual_address, vmf->page);
return -EINVAL;
}
packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd, current);
if (!packet) {
error = -ENOENT;
printk("%s: no packet registered for TID %d\n",
__FUNCTION__, task_pid_vnr(current));
goto put_and_out;
}
for (try = 1; ; ++try) {
error = translate_rva_to_rpa(usrdata->os, ppd->rpgtable,
(unsigned long)vmf->virtual_address,
@@ -598,7 +796,10 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
#define NTRIES 2
if (!error || (try >= NTRIES)) {
if (error) {
printk("translate_rva_to_rpa: error\n");
printk("%s: error translating 0x%p "
"(req: TID: %u, syscall: %lu)\n",
__FUNCTION__, vmf->virtual_address,
packet->req.rtid, packet->req.number);
}
break;
@@ -611,14 +812,16 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
error = remote_page_fault(usrdata, vmf->virtual_address, reason);
if (error) {
printk("forward_page_fault failed. %d\n", error);
printk("%s: error forwarding PF for 0x%p "
"(req: TID: %d, syscall: %lu)\n",
__FUNCTION__, vmf->virtual_address,
packet->req.rtid, packet->req.number);
break;
}
}
if (error) {
printk("mcctrl:page fault error:flags %#x pgoff %#lx va %p page %p\n",
vmf->flags, vmf->pgoff, vmf->virtual_address, vmf->page);
return VM_FAULT_SIGBUS;
ret = VM_FAULT_SIGBUS;
goto put_and_out;
}
rva = (unsigned long)vmf->virtual_address & ~(pgsize - 1);
@@ -630,16 +833,30 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
for (pix = 0; pix < (pgsize / PAGE_SIZE); ++pix) {
struct page *page;
/* LWK may hold large page based mappings that align rva outside
* Linux' VMA, make sure we don't try to map to those pages */
if (rva + (pix * PAGE_SIZE) < vma->vm_start) {
continue;
}
if (pfn_valid(pfn+pix)) {
page = pfn_to_page(pfn+pix);
if ((error = rus_page_hash_insert(page)) < 0) {
printk("rus_vm_fault: error hashing page??\n");
printk("%s: error adding page to RUS hash for 0x%p "
"(req: TID: %d, syscall: %lu)\n",
__FUNCTION__, vmf->virtual_address,
packet->req.rtid, packet->req.number);
}
error = vm_insert_page(vma, rva+(pix*PAGE_SIZE), page);
if (error) {
printk("vm_insert_page: %d\n", error);
printk("%s: error inserting mapping for 0x%p "
"(req: TID: %d, syscall: %lu) error: %d, "
"vm_start: 0x%lx, vm_end: 0x%lx\n",
__FUNCTION__, vmf->virtual_address,
packet->req.rtid, packet->req.number, error,
vma->vm_start, vma->vm_end);
}
}
else
@@ -653,12 +870,19 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
#endif
ihk_device_unmap_memory(dev, phys, pgsize);
if (error) {
printk("mcctrl:page fault:remap error:flags %#x pgoff %#lx va %p page %p\n",
vmf->flags, vmf->pgoff, vmf->virtual_address, vmf->page);
return VM_FAULT_SIGBUS;
printk("%s: remote PF failed for 0x%p, pgoff: %lu "
"(req: TID: %d, syscall: %lu)\n",
__FUNCTION__, vmf->virtual_address, vmf->pgoff,
packet->req.rtid, packet->req.number);
ret = VM_FAULT_SIGBUS;
goto put_and_out;
}
return VM_FAULT_NOPAGE;
ret = VM_FAULT_NOPAGE;
put_and_out:
mcctrl_put_per_proc_data(ppd);
return ret;
}
static struct vm_operations_struct rus_vmops = {
@@ -705,11 +929,11 @@ reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, u
original = override_creds(promoted);
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
start = vm_mmap_pgoff(file, start, end,
PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, 0);
start = vm_mmap_pgoff(file, start, end, PROT_READ|PROT_WRITE|PROT_EXEC,
MAP_FIXED|MAP_SHARED, 0);
#else
start = vm_mmap(file, start, end,
PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, 0);
start = vm_mmap(file, start, end, PROT_READ|PROT_WRITE|PROT_EXEC,
MAP_FIXED|MAP_SHARED, 0);
#endif
revert_creds(original);
@@ -743,9 +967,99 @@ struct pager {
static DEFINE_SEMAPHORE(pager_sem);
static struct list_head pager_list = LIST_HEAD_INIT(pager_list);
int pager_nr_processes = 0;
/*
 * Account for one more process in pager_nr_processes.
 *
 * The counter is only touched while pager_sem is held; if acquiring the
 * semaphore is interrupted the function returns without counting.
 */
void pager_add_process(void)
{
	if (down_interruptible(&pager_sem) != 0)
		return;

	pager_nr_processes++;

	up(&pager_sem);
}
void pager_remove_process(struct mcctrl_per_proc_data *ppd)
{
int error;
struct pager *pager_next, *pager;
if (in_atomic() || in_interrupt()) {
printk("%s: WARNING: shouldn't be called in IRQ context..\n",
__FUNCTION__);
return;
}
/* Clean up device file mappings of this process */
error = down_interruptible(&ppd->devobj_pager_lock);
if (error) {
return;
}
list_for_each_entry_safe(pager, pager_next,
&ppd->devobj_pager_list, list) {
dprintk("%s: devobj pager 0x%lx removed\n", __FUNCTION__, pager);
list_del(&pager->list);
kfree(pager);
}
up(&ppd->devobj_pager_lock);
/* Clean up global pagers for regular file mappings if this
* was the last process */
error = down_interruptible(&pager_sem);
if (error) {
return;
}
--pager_nr_processes;
if (pager_nr_processes > 0) {
goto out;
}
list_for_each_entry_safe(pager, pager_next, &pager_list, list) {
list_del(&pager->list);
if (pager->rofile) {
fput(pager->rofile);
}
if (pager->rwfile) {
fput(pager->rwfile);
}
dprintk("%s: pager 0x%lx removed\n", __FUNCTION__, pager);
kfree(pager);
}
/* Flush page hash as well */
rus_page_hash_put_pages();
out:
up(&pager_sem);
}
/*
 * Result record of a pager create request.
 * NOTE(review): pager_req_create() fills a response with exactly these four
 * fields through a buffer mapped from the caller-supplied physical address,
 * so the layout is presumably shared with the remote side -- confirm before
 * changing field order or types.
 */
struct pager_create_result {
/* pager pointer cast to an integer, used as an opaque handle */
uintptr_t handle;
/* maximum protection computed for the mapping */
int maxprot;
/* MF_* flags chosen for the file */
uint32_t flags;
/* file size taken from the stat result */
size_t size;
};
/*
 * Memory object flag bits.
 * pager_req_create() selects MF_PREMAP | MF_ZEROFILL for files whose path
 * starts with "/dev/shm/Intel_MPI" and MF_PREFETCH for files whose path
 * contains "libmpi".
 */
enum {
/* for memobj.flags */
MF_HAS_PAGER = 0x0001,
MF_SHMDT_OK = 0x0002,
MF_IS_REMOVABLE = 0x0004,
MF_PREFETCH = 0x0008,
MF_ZEROFILL = 0x0010,
MF_REG_FILE = 0x1000,
MF_DEV_FILE = 0x2000,
MF_PREMAP = 0x8000,
/* terminator: implicitly MF_PREMAP + 1, not a flag bit */
MF_END
};
static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
@@ -760,6 +1074,7 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
struct pager *newpager = NULL;
uintptr_t phys;
struct kstat st;
int mf_flags = 0;
dprintk("pager_req_create(%d,%lx)\n", fd, (long)result_pa);
@@ -827,6 +1142,31 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
list_add(&newpager->list, &pager_list);
pager = newpager;
newpager = NULL;
/* Intel MPI library and shared memory "prefetch" */
{
char *pathbuf, *fullpath;
pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
if (pathbuf) {
fullpath = d_path(&file->f_path, pathbuf, PATH_MAX);
if (!IS_ERR(fullpath)) {
if (!strncmp("/dev/shm/Intel_MPI", fullpath, 18)) {
mf_flags = (MF_PREMAP | MF_ZEROFILL);
dprintk("%s: filename: %s, premap & zerofill\n",
__FUNCTION__, fullpath);
}
else if (strstr(fullpath, "libmpi") != NULL) {
mf_flags = MF_PREFETCH;
dprintk("%s: filename: %s, prefetch\n",
__FUNCTION__, fullpath);
}
}
kfree(pathbuf);
}
}
break;
}
@@ -854,8 +1194,17 @@ found:
phys = ihk_device_map_memory(dev, result_pa, sizeof(*resp));
resp = ihk_device_map_virtual(dev, phys, sizeof(*resp), NULL, 0);
if (!resp) {
printk("%s: ERROR: invalid response structure address\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
resp->handle = (uintptr_t)pager;
resp->maxprot = maxprot;
resp->flags = mf_flags;
resp->size = st.size;
ihk_device_unmap_virtual(dev, resp, sizeof(*resp));
ihk_device_unmap_memory(dev, phys, sizeof(*resp));
@@ -958,6 +1307,13 @@ static int pager_req_read(ihk_os_t os, uintptr_t handle, off_t off, size_t size,
phys = ihk_device_map_memory(dev, rpa, size);
buf = ihk_device_map_virtual(dev, phys, size, NULL, 0);
if (!buf) {
printk("%s: ERROR: invalid buffer address\n",
__FUNCTION__);
ss = -EINVAL;
goto out;
}
fs = get_fs();
set_fs(KERNEL_DS);
pos = off;
@@ -1040,6 +1396,13 @@ static int pager_req_write(ihk_os_t os, uintptr_t handle, off_t off, size_t size
phys = ihk_device_map_memory(dev, rpa, size);
buf = ihk_device_map_virtual(dev, phys, size, NULL, 0);
if (!buf) {
printk("%s: ERROR: invalid buffer address\n",
__FUNCTION__);
ss = -EINVAL;
goto out;
}
fs = get_fs();
set_fs(KERNEL_DS);
pos = off;
@@ -1086,8 +1449,18 @@ static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off,
struct pager *pager = NULL;
struct pager_map_result *resp;
uintptr_t phys;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcctrl_per_proc_data *ppd = NULL;
dprintk("pager_req_map(%p,%d,%lx,%lx,%lx)\n", os, fd, len, off, result_rpa);
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (unlikely(!ppd)) {
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
__FUNCTION__, task_tgid_vnr(current));
return -1;
}
pager = kzalloc(sizeof(*pager), GFP_ATOMIC);
if (!pager) {
error = -ENOMEM;
@@ -1148,13 +1521,29 @@ static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off,
phys = ihk_device_map_memory(dev, result_rpa, sizeof(*resp));
resp = ihk_device_map_virtual(dev, phys, sizeof(*resp), NULL, 0);
if (!resp) {
printk("%s: ERROR: invalid response structure address\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
resp->handle = (uintptr_t)pager;
resp->maxprot = maxprot;
ihk_device_unmap_virtual(dev, resp, sizeof(*resp));
ihk_device_unmap_memory(dev, phys, sizeof(*resp));
error = down_interruptible(&ppd->devobj_pager_lock);
if (error) {
error = -EINTR;
goto out;
}
list_add_tail(&pager->list, &ppd->devobj_pager_list);
up(&ppd->devobj_pager_lock);
pager = 0;
error = 0;
pager = 0; /* pager should be in list? */
out:
if (file) {
@@ -1163,6 +1552,7 @@ out:
if (pager) {
kfree(pager);
}
mcctrl_put_per_proc_data(ppd);
dprintk("pager_req_map(%p,%d,%lx,%lx,%lx): %d\n", os, fd, len, off, result_rpa, error);
return error;
}
@@ -1253,6 +1643,13 @@ out_release:
phys = ihk_device_map_memory(dev, ppfn_rpa, sizeof(*ppfn));
ppfn = ihk_device_map_virtual(dev, phys, sizeof(*ppfn), NULL, 0);
if (!ppfn) {
printk("%s: ERROR: invalid PFN address\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
*ppfn = pfn;
ihk_device_unmap_virtual(dev, ppfn, sizeof(*ppfn));
ihk_device_unmap_memory(dev, phys, sizeof(*ppfn));
@@ -1263,13 +1660,10 @@ out:
return error;
}
static int pager_req_unmap(ihk_os_t os, uintptr_t handle)
static int __pager_unmap(struct pager *pager)
{
struct pager * const pager = (void *)handle;
int error;
dprintk("pager_req_unmap(%p,%lx)\n", os, handle);
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
down_write(&current->mm->mmap_sem);
error = do_munmap(current->mm, pager->map_uaddr, pager->map_len);
@@ -1279,12 +1673,42 @@ static int pager_req_unmap(ihk_os_t os, uintptr_t handle)
#endif
if (error) {
printk("pager_req_unmap(%p,%lx):do_munmap failed. %d\n", os, handle, error);
/* through */
printk("%s: WARNING: munmap failed for pager 0x%lx: %d\n",
__FUNCTION__, (uintptr_t)pager, error);
}
return error;
}
/*
 * pager_req_unmap() - tear down a device-file pager created by the map
 * request.
 *
 * @os:     OS instance the request belongs to
 * @handle: pager pointer previously handed out as an opaque handle
 *
 * Unlinks the pager from the calling process' devobj pager list, unmaps its
 * user mapping through __pager_unmap() and frees it.
 *
 * Return: the result of __pager_unmap(), -EINTR when waiting for the list
 * lock was interrupted (the pager is then left on the list), or -1 when the
 * calling process has no per-process structure.
 */
static int pager_req_unmap(ihk_os_t os, uintptr_t handle)
{
	struct pager * const pager = (void *)handle;
	int error;
	struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
	struct mcctrl_per_proc_data *ppd = NULL;

	dprintk("pager_req_unmap(%p,%lx)\n", os, handle);

	ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
	if (unlikely(!ppd)) {
		kprintf("%s: ERROR: no per-process structure for PID %d??\n",
				__FUNCTION__, task_tgid_vnr(current));
		return -1;
	}

	error = down_interruptible(&ppd->devobj_pager_lock);
	if (error) {
		error = -EINTR;
		goto out;
	}

	list_del(&pager->list);
	up(&ppd->devobj_pager_lock);

	error = __pager_unmap(pager);
	kfree(pager);
	dprintk("pager_req_unmap(%p,%lx): %d\n", os, handle, error);

out:
	/* Balance mcctrl_get_per_proc_data() on every path past the lookup */
	mcctrl_put_per_proc_data(ppd);
	return error;
}
@@ -1351,6 +1775,12 @@ void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
res = ihk_device_map_virtual(ihk_os_to_dev(os),
phys, sizeof(*res), NULL, 0);
if (!res) {
printk("%s: ERROR: invalid response structure address\n",
__FUNCTION__);
return;
}
/* Map response structure and notify offloading thread */
res->ret = ret;
res->stid = stid;
@@ -1582,6 +2012,7 @@ int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet)
dprintk("%s: pid: %d, rpgtable: 0x%lx updated\n",
__FUNCTION__, ppd->pid, ppd->rpgtable);
mcctrl_put_per_proc_data(ppd);
}
ret = clear_pte_range(sc->args[0], sc->args[1]);

View File

@@ -18,7 +18,7 @@
#include "mcctrl.h"
#include "sysfs_msg.h"
#define dprintk(...) do { if (0) printk(KERN_DEBUG __VA_ARGS__); } while (0)
#define dprintk(...) do { if (0) printk(__VA_ARGS__); } while (0)
#define wprintk(...) do { if (1) printk(KERN_WARNING __VA_ARGS__); } while (0)
#define eprintk(...) do { if (1) printk(KERN_ERR __VA_ARGS__); } while (0)
@@ -278,8 +278,10 @@ release_i(struct sysfsm_node *np)
sdp = np->sdp;
if (np->server_ops && np->server_ops->release) {
(*np->server_ops->release)(np->server_ops, np);
if (np->type != SNT_DIR) {
if (np->server_ops && np->server_ops->release) {
(*np->server_ops->release)(np->server_ops, np);
}
}
kfree(np->name);
kfree(np);
@@ -719,8 +721,6 @@ unlink_i(struct sysfsm_node *np)
else if (np->type == SNT_DIR) {
if (np->parent != np) {
kobject_del(&np->kobj);
error = 0;
goto out;
}
}
else if (np->type == SNT_LINK) {
@@ -1232,9 +1232,16 @@ sysfsm_cleanup(ihk_os_t os)
int error;
ihk_device_t dev = ihk_os_to_dev(os);
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
struct sysfsm_data *sdp = &udp->sysfsm_data;
struct sysfsm_data *sdp;
struct sysfsm_node *np;
if (!udp) {
printk("%s: WARNING: no mcctrl_usrdata found\n", __FUNCTION__);
return;
}
sdp = &udp->sysfsm_data;
dprintk("mcctrl:sysfsm_cleanup(%p)\n", os);
if (sdp->sysfs_buf) {
@@ -2095,9 +2102,16 @@ struct sysfsm_ops snooping_local_ops_s = {
/**** local list ****/
static ssize_t snooping_local_show_pbl(struct sysfsm_ops *ops, void *instance, void *buf, size_t bufsize)
{
size_t ret;
const struct sysfsm_bitmap_param *p = instance;
return bitmap_scnlistprintf(buf, bufsize, p->ptr, p->nbits);
ret = bitmap_scnlistprintf(buf, bufsize, p->ptr, p->nbits);
if (ret < bufsize - 1) {
sprintf(buf + ret, "\n");
return ret + 1;
}
return 0;
} /* snooping_local_show_pbl() */
struct sysfsm_ops snooping_local_ops_pbl = {
@@ -2108,9 +2122,16 @@ struct sysfsm_ops snooping_local_ops_pbl = {
/**** local map ****/
static ssize_t snooping_local_show_pb(struct sysfsm_ops *ops, void *instance, void *buf, size_t bufsize)
{
size_t ret;
const struct sysfsm_bitmap_param *p = instance;
return bitmap_scnprintf(buf, bufsize, p->ptr, p->nbits);
ret = bitmap_scnprintf(buf, bufsize, p->ptr, p->nbits);
if (ret < bufsize - 1) {
sprintf(buf + ret, "\n");
return ret + 1;
}
return 0;
} /* snooping_local_show_pb() */
struct sysfsm_ops snooping_local_ops_pb = {

View File

@@ -14,11 +14,11 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/version.h>
#include "../../config.h"
#include "../../../config.h"
#include "mcctrl.h"
#include "sysfs_msg.h"
#define dprintk(...) do { if (0) printk(KERN_DEBUG __VA_ARGS__); } while (0)
#define dprintk(...) do { if (0) printk(__VA_ARGS__); } while (0)
#define wprintk(...) do { if (1) printk(KERN_WARNING __VA_ARGS__); } while (0)
#define eprintk(...) do { if (1) printk(KERN_ERR __VA_ARGS__); } while (0)
@@ -92,27 +92,19 @@ void setup_local_snooping_samples(ihk_os_t os)
void setup_local_snooping_files(ihk_os_t os)
{
struct ihk_cpu_info *info;
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
struct sysfsm_bitmap_param param;
static unsigned long cpu_offline = 0x0;
int i;
int error;
info = ihk_os_get_cpu_info(os);
if (!info) {
eprintk("mcctrl:ihk_os_get_cpu_info failed.\n");
return;
}
memset(udp->cpu_online, 0, sizeof(udp->cpu_online));
for (i = 0; i < info->n_cpus; i++) {
udp->cpu_online[i / BITS_PER_LONG] =
udp->cpu_online[i / BITS_PER_LONG] | (1 << (i % BITS_PER_LONG));
for (i = 0; i < udp->cpu_info->n_cpus; i++) {
set_bit(i, udp->cpu_online);
}
param.nbits = CPU_LONGS * BITS_PER_LONG;
param.ptr = udp->cpu_online;
param.ptr = &udp->cpu_online;
dprintk("mcctrl:setup_local_snooping_files: CPU_LONGS=%d, BITS_PER_LONG=%d\n",
CPU_LONGS, BITS_PER_LONG);
@@ -187,141 +179,122 @@ static void free_cpu_topology(struct mcctrl_usrdata *udp)
return;
} /* free_cpu_topology() */
/*
 * Undo the virtual and physical mappings of the CPU mapping table,
 * whose size is cpu_mapping_elems entries of struct cpu_mapping.
 */
static void free_cpu_mapping(struct mcctrl_usrdata *udp)
{
	const size_t nbytes =
		udp->cpu_mapping_elems * sizeof(struct cpu_mapping);
	ihk_device_t ihk_dev = ihk_os_to_dev(udp->os);

	ihk_device_unmap_virtual(ihk_dev, udp->cpu_mapping, nbytes);
	ihk_device_unmap_memory(ihk_dev, udp->cpu_mapping_pa, nbytes);
} /* free_cpu_mapping() */
/*
 * Release the topology data kept for an OS instance: node topology,
 * CPU topology and the CPU mapping, in that order.
 * Prints a warning and does nothing when the OS has no mcctrl_usrdata.
 */
void free_topology_info(ihk_os_t os)
{
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
/* Nothing to free without the per-OS data */
if (!udp) {
printk("%s: WARNING: no mcctrl_usrdata found\n", __FUNCTION__);
return;
}
free_node_topology(udp);
free_cpu_topology(udp);
free_cpu_mapping(udp);
return;
} /* free_topology_info() */
void reply_get_cpu_mapping(long req_pa)
/*
* CPU and NUMA node mapping conversion functions.
*/
int mckernel_cpu_2_linux_cpu(struct mcctrl_usrdata *udp, int cpu_id)
{
struct get_cpu_mapping_req *req = phys_to_virt(req_pa);
return (cpu_id < udp->cpu_info->n_cpus) ?
udp->cpu_info->mapping[cpu_id] : -1;
}
req->busy = 0;
wake_up(&req->wq);
return;
} /* reply_get_cpu_mapping() */
static int get_cpu_mapping(struct mcctrl_usrdata *udp)
int mckernel_cpu_2_hw_id(struct mcctrl_usrdata *udp, int cpu_id)
{
int error;
ihk_device_t dev = ihk_os_to_dev(udp->os);
struct get_cpu_mapping_req *req = NULL;
struct ikc_scd_packet packet;
size_t size;
return (cpu_id < udp->cpu_info->n_cpus) ?
udp->cpu_info->hw_ids[cpu_id] : -1;
}
dprintk("get_cpu_mapping(%p)\n", udp);
req = kmalloc(sizeof(*req), GFP_KERNEL);
if (!req) {
error = -ENOMEM;
eprintk("mcctrl:get_cpu_mapping:kmalloc failed. %d\n", error);
goto out;
}
req->busy = 1;
req->error = -1;
init_waitqueue_head(&req->wq);
packet.msg = SCD_MSG_GET_CPU_MAPPING;
packet.arg = virt_to_phys(req);
#define GET_CPU_MAPPING_CPU 0
error = mcctrl_ikc_send(udp->os, GET_CPU_MAPPING_CPU, &packet);
if (error) {
eprintk("mcctrl:get_cpu_mapping:"
"mcctrl_ikc_send failed. %d\n", error);
goto out;
}
error = wait_event_interruptible(req->wq, !req->busy);
if (error) {
eprintk("mcctrl:get_cpu_mapping:"
"wait_event_interruptible failed. %d\n", error);
req = NULL; /* XXX */
goto out;
}
if (req->error) {
error = req->error;
eprintk("mcctrl:get_cpu_mapping:"
"SCD_MSG_GET_CPU_MAPPING failed. %d\n", error);
goto out;
}
size = req->buf_elems * sizeof(struct cpu_mapping);
udp->cpu_mapping_elems = req->buf_elems;
udp->cpu_mapping_pa = ihk_device_map_memory(dev, req->buf_rpa, size);
udp->cpu_mapping = ihk_device_map_virtual(
dev, udp->cpu_mapping_pa, size, NULL, 0);
error = 0;
out:
dprintk("get_cpu_mapping(%p): %d\n", udp, error);
kfree(req);
return error;
} /* get_cpu_mapping() */
static int hwid_to_cpu(struct mcctrl_usrdata *udp, int hw_id)
int linux_cpu_2_mckernel_cpu(struct mcctrl_usrdata *udp, int cpu_id)
{
int i;
for (i = 0; i < udp->cpu_mapping_elems; ++i) {
if (udp->cpu_mapping[i].hw_id == hw_id) {
return udp->cpu_mapping[i].cpu_number;
for (i = 0; i < udp->cpu_info->n_cpus; ++i) {
if (udp->cpu_info->mapping[i] == cpu_id)
return i;
}
return -1;
}
/*
 * Hardware ID conversion helpers, currently compiled out.
 * All of them return -1 when no matching entry exists in cpu_info.
 */
#if 0
/* Index into cpu_info->hw_ids[] whose entry equals hw_id. */
int hw_id_2_mckernel_cpu(struct mcctrl_usrdata *udp, int hw_id)
{
int i;
for (i = 0; i < udp->cpu_info->n_cpus; ++i) {
if (udp->cpu_info->hw_ids[i] == hw_id) {
return i;
}
}
return -1;
}
/* Same lookup, with the index translated by mckernel_cpu_2_linux_cpu(). */
int hw_id_2_linux_cpu(struct mcctrl_usrdata *udp, int hw_id)
{
int i;
for (i = 0; i < udp->cpu_info->n_cpus; ++i) {
if (udp->cpu_info->hw_ids[i] == hw_id) {
return mckernel_cpu_2_linux_cpu(udp, i);
}
}
return -1;
}
/* hw_ids[] entry for the index linux_cpu_2_mckernel_cpu() yields for cpu. */
int linux_cpu_2_hw_id(struct mcctrl_usrdata *udp, int cpu)
{
int mckernel_cpu = linux_cpu_2_mckernel_cpu(udp, cpu);
/* Bounds-check both ends before indexing hw_ids[] */
return (mckernel_cpu >= 0 && mckernel_cpu < udp->cpu_info->n_cpus) ?
udp->cpu_info->hw_ids[mckernel_cpu] : -1;
}
#endif
/*
 * Look up the numa_mapping[] entry for a McKernel NUMA node index.
 *
 * @udp:     mcctrl data holding mem_info
 * @numa_id: index into mem_info->numa_mapping[]
 *
 * Return: the mapped value, or -1 when numa_id is negative or not below
 * mem_info->n_numa_nodes.
 */
int mckernel_numa_2_linux_numa(struct mcctrl_usrdata *udp, int numa_id)
{
	/* A negative index must not reach the array access below */
	if (numa_id < 0 || numa_id >= udp->mem_info->n_numa_nodes)
		return -1;

	return udp->mem_info->numa_mapping[numa_id];
}
/*
 * Reverse lookup in mem_info->numa_mapping[]: return the first index whose
 * entry equals numa_id, or -1 when none of the n_numa_nodes entries match.
 */
int linux_numa_2_mckernel_numa(struct mcctrl_usrdata *udp, int numa_id)
{
	int node = 0;

	while (node < udp->mem_info->n_numa_nodes) {
		if (udp->mem_info->numa_mapping[node] == numa_id)
			return node;
		++node;
	}

	return -1;
}
/*
 * Build mckmap from linmap: mckmap is cleared, then for every CPU set in
 * linmap the translated CPU number is set in mckmap when it is >= 0.
 *
 * Returns 0 on success, or the negative value returned by
 * ihk_device_linux_cpu_to_hw_id() for the first CPU it fails on (mckmap
 * then holds only the CPUs translated so far).
 */
static int translate_cpumap(struct mcctrl_usrdata *udp,
cpumask_t *linmap, cpumask_t *mckmap)
{
int error;
ihk_device_t dev = ihk_os_to_dev(udp->os);
int lincpu;
int hw_id;
int mckcpu;
dprintk("translate_cpumap(%p,%p,%p)\n", udp, linmap, mckmap);
cpumask_clear(mckmap);
for_each_cpu(lincpu, linmap) {
hw_id = ihk_device_linux_cpu_to_hw_id(dev, lincpu);
if (hw_id < 0) {
error = hw_id;
eprintk("mcctrl:translate_cpumap:"
"ihk_device_linux_cpu_to_hw_id failed."
" %d\n", error);
goto out;
}
/*
 * NOTE(review): mckcpu is assigned twice in a row, so the result of
 * linux_cpu_2_mckernel_cpu() is immediately overwritten by
 * hwid_to_cpu().  This looks like an old/new pair of lines from a
 * diff -- confirm which single translation is intended.
 */
mckcpu = linux_cpu_2_mckernel_cpu(udp, lincpu);
mckcpu = hwid_to_cpu(udp, hw_id);
/* CPUs without a translation (negative result) are silently skipped */
if (mckcpu >= 0) {
cpumask_set_cpu(mckcpu, mckmap);
}
}
error = 0;
out:
dprintk("translate_cpumap(%p,%p,%p): %d\n", udp, linmap, mckmap, error);
return error;
} /* translate_cpumap() */
@@ -361,7 +334,7 @@ out:
return (error)? ERR_PTR(error): topo;
} /* get_cache_topology() */
static struct cpu_topology *get_cpu_topology_one(struct mcctrl_usrdata *udp,
static struct cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
int index)
{
int error;
@@ -370,41 +343,43 @@ static struct cpu_topology *get_cpu_topology_one(struct mcctrl_usrdata *udp,
struct cache_topology *cache;
struct ihk_cache_topology *saved_cache;
dprintk("get_cpu_topology_one(%p,%d)\n", udp, index);
dprintk("get_one_cpu_topology(%p,%d)\n", udp, index);
topology = kmalloc(sizeof(*topology), GFP_KERNEL);
if (!topology) {
error = -ENOMEM;
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"kmalloc failed. %d\n", error);
goto out;
}
INIT_LIST_HEAD(&topology->cache_list);
topology->cpu_mapping = &udp->cpu_mapping[index];
topology->mckernel_cpu_id = index;
topology->saved = ihk_device_get_cpu_topology(dev,
mckernel_cpu_2_hw_id(udp, index));
topology->saved = ihk_device_get_cpu_topology(
dev, topology->cpu_mapping->hw_id);
if (IS_ERR(topology->saved)) {
error = PTR_ERR(topology->saved);
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"ihk_device_get_cpu_topology failed. %d\n",
error);
goto out;
}
error = translate_cpumap(udp, &topology->saved->core_siblings,
error = translate_cpumap(udp,
&topology->saved->core_siblings,
&topology->core_siblings);
if (error) {
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"translate_cpumap(core_siblings) failed."
" %d\n", error);
goto out;
}
error = translate_cpumap(udp, &topology->saved->thread_siblings,
error = translate_cpumap(udp,
&topology->saved->thread_siblings,
&topology->thread_siblings);
if (error) {
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"translate_cpumap(thread_siblings) failed."
" %d\n", error);
goto out;
@@ -415,7 +390,7 @@ static struct cpu_topology *get_cpu_topology_one(struct mcctrl_usrdata *udp,
cache = get_cache_topology(udp, topology, saved_cache);
if (IS_ERR(cache)) {
error = PTR_ERR(cache);
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"get_cache_topology failed. %d\n",
error);
goto out;
@@ -429,10 +404,10 @@ out:
if (error && !IS_ERR_OR_NULL(topology)) {
free_cpu_topology_one(udp, topology);
}
dprintk("get_cpu_topology_one(%p,%d): %d %p\n",
dprintk("get_one_cpu_topology(%p,%d): %d %p\n",
udp, index, error, topology);
return (error)? ERR_PTR(error): topology;
} /* get_cpu_topology_one() */
} /* get_one_cpu_topology() */
static int get_cpu_topology(struct mcctrl_usrdata *udp)
{
@@ -441,12 +416,12 @@ static int get_cpu_topology(struct mcctrl_usrdata *udp)
struct cpu_topology *topology;
dprintk("get_cpu_topology(%p)\n", udp);
for (index = 0; index < udp->cpu_mapping_elems; ++index) {
topology = get_cpu_topology_one(udp, index);
for (index = 0; index < udp->cpu_info->n_cpus; ++index) {
topology = get_one_cpu_topology(udp, index);
if (IS_ERR(topology)) {
error = PTR_ERR(topology);
eprintk("mcctrl:get_cpu_topology:"
"get_cpu_topology_one failed. %d\n",
eprintk("mcctrl:get_cpu_topology: "
"get_one_cpu_topology failed. %d\n",
error);
goto out;
}
@@ -460,15 +435,15 @@ out:
return error;
} /* get_cpu_topology() */
static void setup_one_cache_files(struct mcctrl_usrdata *udp,
static void setup_cpu_sysfs_cache_files(struct mcctrl_usrdata *udp,
struct cpu_topology *cpu, struct cache_topology *cache)
{
char *prefix = "/sys/devices/system/cpu";
int cpu_number = cpu->cpu_mapping->cpu_number;
int cpu_number = cpu->mckernel_cpu_id;
int index = cache->saved->index;
struct sysfsm_bitmap_param param;
dprintk("setup_one_cache_files(%p,%p,%p)\n", udp, cpu, cache);
dprintk("setup_cpu_sysfs_cache_files(%p,%p,%p)\n", udp, cpu, cache);
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_d64,
&cache->saved->level, 0444,
@@ -509,19 +484,19 @@ static void setup_one_cache_files(struct mcctrl_usrdata *udp,
"%s/cpu%d/cache/index%d/shared_cpu_list",
prefix, cpu_number, index);
dprintk("setup_one_cache_files(%p,%p,%p):\n", udp, cpu, cache);
dprintk("setup_cpu_sysfs_cache_files(%p,%p,%p):\n", udp, cpu, cache);
return;
} /* setup_one_cache_files() */
} /* setup_cpu_sysfs_cache_files() */
static void setup_one_cpu_files(struct mcctrl_usrdata *udp,
static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
struct cpu_topology *cpu)
{
char *prefix = "/sys/devices/system/cpu";
int cpu_number = cpu->cpu_mapping->cpu_number;
int cpu_number = cpu->mckernel_cpu_id;
struct sysfsm_bitmap_param param;
struct cache_topology *cache;
dprintk("setup_one_cpu_files(%p,%p)\n", udp, cpu);
dprintk("setup_cpu_sysfs_files(%p,%p)\n", udp, cpu);
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_d32,
&cpu->saved->physical_package_id, 0444,
@@ -553,41 +528,61 @@ static void setup_one_cpu_files(struct mcctrl_usrdata *udp,
prefix, cpu_number);
list_for_each_entry(cache, &cpu->cache_list, chain) {
setup_one_cache_files(udp, cpu, cache);
setup_cpu_sysfs_cache_files(udp, cpu, cache);
}
dprintk("setup_one_cpu_files(%p,%p):\n", udp, cpu);
dprintk("setup_cpu_sysfs_files(%p,%p):\n", udp, cpu);
return;
} /* setup_one_cpu_files() */
} /* setup_cpu_sysfs_files() */
static void setup_cpu_files(struct mcctrl_usrdata *udp)
static void setup_cpus_sysfs_files_node_link(struct mcctrl_usrdata *udp)
{
int error;
int cpu;
struct sysfs_handle handle;
for (cpu = 0; cpu < udp->cpu_info->n_cpus; ++cpu) {
int node = linux_numa_2_mckernel_numa(udp,
cpu_to_node(mckernel_cpu_2_linux_cpu(udp, cpu)));
error = sysfsm_lookupf(udp->os, &handle,
"/sys/devices/system/node/node%d", node);
if (error) {
panic("sysfsm_lookupf: node for CPU");
}
error = sysfsm_symlinkf(udp->os, handle,
"/sys/devices/system/cpu/cpu%d/node%d",
cpu, node);
if (error) {
panic("sysfsm_symlinkf(CPU in node)");
}
}
error = 0;
return;
}
static void setup_cpus_sysfs_files(struct mcctrl_usrdata *udp)
{
int error;
struct cpu_topology *cpu;
dprintk("setup_cpu_file(%p)\n", udp);
error = get_cpu_mapping(udp);
if (error) {
eprintk("mcctrl:setup_cpu_files:"
"get_cpu_mapping failed. %d\n", error);
goto out;
}
error = get_cpu_topology(udp);
if (error) {
eprintk("mcctrl:setup_cpu_files:"
eprintk("mcctrl:setup_cpus_sysfs_files:"
"get_cpu_topology failed. %d\n", error);
goto out;
}
list_for_each_entry(cpu, &udp->cpu_topology_list, chain) {
setup_one_cpu_files(udp, cpu);
setup_cpu_sysfs_files(udp, cpu);
}
error = 0;
out:
dprintk("setup_cpu_file(%p):\n", udp);
return;
} /* setup_cpu_files() */
} /* setup_cpus_sysfs_files() */
static struct node_topology *get_one_node_topology(struct mcctrl_usrdata *udp,
struct ihk_node_topology *saved)
@@ -629,8 +624,10 @@ static int get_node_topology(struct mcctrl_usrdata *udp)
struct node_topology *topology;
dprintk("get_node_topology(%p)\n", udp);
for (node = 0; ; ++node) {
saved = ihk_device_get_node_topology(dev, node);
for (node = 0; node < udp->mem_info->n_numa_nodes; ++node) {
saved = ihk_device_get_node_topology(dev,
mckernel_numa_2_linux_numa(udp, node));
if (IS_ERR(saved)) {
break;
}
@@ -647,6 +644,8 @@ static int get_node_topology(struct mcctrl_usrdata *udp)
goto out;
}
topology->mckernel_numa_id = node;
list_add(&topology->chain, &udp->node_topology_list);
}
@@ -659,6 +658,7 @@ out:
static int setup_node_files(struct mcctrl_usrdata *udp)
{
int error;
int node;
struct node_topology *p;
struct sysfsm_bitmap_param param;
@@ -670,16 +670,71 @@ static int setup_node_files(struct mcctrl_usrdata *udp)
goto out;
}
memset(&udp->numa_online, 0, sizeof(udp->numa_online));
for (node = 0; node < udp->mem_info->n_numa_nodes; ++node) {
node_set(node, udp->numa_online);
}
param.nbits = MAX_NUMNODES;
param.ptr = &udp->numa_online;
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pbl, &param, 0444,
"/sys/devices/system/node/online");
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pbl, &param, 0444,
"/sys/devices/system/node/possible");
list_for_each_entry(p, &udp->node_topology_list, chain) {
struct sysfs_handle handle;
int cpu;
size_t offset = 0;
param.nbits = nr_cpumask_bits;
param.ptr = &p->cpumap;
for (node = 0; node < udp->mem_info->n_numa_nodes; ++node) {
if (node > 0) {
offset += snprintf(&p->mckernel_numa_distance_s[offset],
NODE_DISTANCE_S_SIZE - offset, "%s", " ");
}
offset += snprintf(&p->mckernel_numa_distance_s[offset],
NODE_DISTANCE_S_SIZE - offset, "%d",
node_distance(
mckernel_numa_2_linux_numa(udp, p->mckernel_numa_id),
mckernel_numa_2_linux_numa(udp, node)
));
}
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_s,
p->mckernel_numa_distance_s, 0444,
"/sys/devices/system/node/node%d/distance",
p->mckernel_numa_id);
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pb, &param, 0444,
"/sys/devices/system/node/node%d/cpumap",
p->saved->node_number);
p->mckernel_numa_id);
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pbl, &param, 0444,
"/sys/devices/system/node/node%d/cpulist",
p->saved->node_number);
p->mckernel_numa_id);
/* Add CPU symlinks for this node */
for (cpu = 0; cpu < udp->cpu_info->n_cpus; ++cpu) {
if (linux_numa_2_mckernel_numa(udp,
cpu_to_node(mckernel_cpu_2_linux_cpu(udp, cpu)))
!= p->mckernel_numa_id) {
continue;
}
error = sysfsm_lookupf(udp->os, &handle,
"/sys/devices/system/cpu/cpu%d", cpu);
if (error) {
panic("sysfsm_lookupf(CPU in node)");
}
error = sysfsm_symlinkf(udp->os, handle,
"/sys/devices/system/node/node%d/cpu%d",
p->mckernel_numa_id, cpu);
if (error) {
panic("sysfsm_symlinkf(CPU in node)");
}
}
}
error = 0;
@@ -1026,11 +1081,18 @@ void setup_sysfs_files(ihk_os_t os)
panic("sysfsm_unlinkf");
}
setup_local_snooping_samples(os);
//setup_local_snooping_samples(os);
setup_local_snooping_files(os);
setup_cpu_files(udp);
setup_cpus_sysfs_files(udp);
setup_node_files(udp);
setup_pci_files(udp);
setup_cpus_sysfs_files_node_link(udp);
//setup_pci_files(udp);
/* Indicate sysfs files setup completion for boot script */
error = sysfsm_mkdirf(os, NULL, "/sys/setup_complete");
if (error) {
panic("sysfsm_mkdir(complete)");
}
return;
} /* setup_files() */

View File

@@ -1,6 +1,6 @@
ENABLE_MCOVERLAYFS=@ENABLE_MCOVERLAYFS@
RELEASE=@UNAME_R@
RELEASE=$(shell uname -r)
MAJOR=$(shell echo ${RELEASE} | sed -e 's/^\([0-9]*\).*/\1/')
MINOR=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/')
PATCH=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/')
@@ -9,15 +9,19 @@ RHEL_RELEASE_TMP=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-
RHEL_RELEASE=$(shell if [ "${RELEASE}" == "${RHEL_RELEASE_TMP}" ]; then echo ""; else echo ${RHEL_RELEASE_TMP}; fi)
BUILD_MODULE_TMP=$(shell if [ "${RHEL_RELEASE}" == "" ]; then echo "org"; else echo "rhel"; fi)
BUILD_MODULE=none
#$(info "LINUX_VERSION_CODE: ${LINUX_VERSION_CODE}, RHEL_RELEASE: ${RHEL_RELEASE}")
ifeq ($(ENABLE_MCOVERLAYFS),yes)
ifeq ($(BUILD_MODULE_TMP),org)
ifeq ($(BUILD_MODULE),none)
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -ge 262144 -a ${LINUX_VERSION_CODE} -lt 262400 ]; then echo "linux-4.0.9"; else echo "none"; fi)
endif
ifeq ($(BUILD_MODULE),none)
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -ge 243680 -a ${LINUX_VERSION_CODE} -lt 263936 ]; then echo "linux-4.6.7"; else echo "none"; fi)
endif
endif
ifeq ($(BUILD_MODULE_TMP),rhel)
ifeq ($(BUILD_MODULE),none)
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -eq 199168 -a ${RHEL_RELEASE} -eq 327 ]; then echo "linux-3.10.0-327.36.1.el7"; else echo "none"; fi)
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -eq 199168 -a ${RHEL_RELEASE} -ge 327 -a ${RHEL_RELEASE} -le 514 ]; then echo "linux-3.10.0-327.36.1.el7"; else echo "none"; fi)
endif
endif
endif
@@ -32,6 +36,7 @@ endif
clean:
@(cd linux-3.10.0-327.36.1.el7; make clean)
@(cd linux-4.0.9; make clean)
@(cd linux-4.6.7; make clean)
install:
ifneq ($(BUILD_MODULE),none)

View File

@@ -0,0 +1,21 @@
# Build rules for the mcoverlay kernel module.
# The @...@ tokens are placeholders -- presumably substituted at configure
# time (TODO confirm against the configure script).
# KDIR and ARCH use ?= so a value from the environment or command line wins.
KDIR ?= @KDIR@
ARCH ?= @ARCH@
# Directory the built module is installed into
KMODDIR = @KMODDIR@
src = @abs_srcdir@
# Module mcoverlay.ko is linked from the objects listed in mcoverlay-y
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules
# Delegate the build to the kernel build system rooted at $(KDIR)
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
# Remove build products and intermediate files
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
# Create $(KMODDIR) if needed and copy the module there
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcoverlay.ko $(KMODDIR)

View File

@@ -0,0 +1,460 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/fdtable.h>
#include <linux/ratelimit.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
/*
 * Runtime switch consulted by ovl_do_check_copy_up().  Exposed as the
 * module parameter "check_copy_up"; the description must be registered
 * under that same external name so it is attached to the parameter
 * rather than to a non-existent "ovl_check_copy_up" one.
 */
static bool __read_mostly ovl_check_copy_up;
module_param_named(check_copy_up, ovl_check_copy_up, bool,
		   S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(check_copy_up,
		 "Warn on copy-up when causing process also has a R/O fd open");
/*
 * iterate_fd() callback: @data is the dentry about to be copied up.
 * Emits a rate-limited warning when open file @f refers to that
 * dentry's inode.  Always returns 0.
 */
static int ovl_check_fd(const void *data, struct file *f, unsigned int fd)
{
	const struct dentry *target = data;

	if (f->f_inode != d_inode(target))
		return 0;

	pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
			    f, fd, current->pid, current->comm);
	return 0;
}
/*
 * When the check_copy_up switch is on, walk the file descriptors of the
 * current process and let ovl_check_fd() warn about a situation such as:
 *
 *	fd1 = open("foo", O_RDONLY);
 *	fd2 = open("foo", O_RDWR);
 */
static void ovl_do_check_copy_up(struct dentry *dentry)
{
	if (!ovl_check_copy_up)
		return;

	iterate_fd(current->files, 0, ovl_check_fd, dentry);
}
/*
 * Copy every extended attribute listed on @old to @new.
 *
 * @opt: configuration option bits; when OVL_OPT_NOFSCHECK(opt) is true an
 *       attribute whose value cannot be read is skipped instead of
 *       aborting the copy.
 *
 * Returns 0 on success (also when either inode has no getxattr operation
 * or listing is not supported), otherwise a negative errno.
 */
int ovl_copy_xattr(struct dentry *old, struct dentry *new, unsigned opt)
{
	ssize_t list_size, size, value_size = 0;
	char *buf, *name, *value = NULL;
	/*
	 * Must start at 0: with OVL_OPT_NOFSCHECK every attribute may be
	 * skipped via "continue", in which case nothing else assigns error
	 * before it is returned.
	 */
	int error = 0;

	if (!old->d_inode->i_op->getxattr ||
	    !new->d_inode->i_op->getxattr)
		return 0;

	/* First call only sizes the name list */
	list_size = vfs_listxattr(old, NULL, 0);
	if (list_size <= 0) {
		if (list_size == -EOPNOTSUPP)
			return 0;
		return list_size;
	}

	buf = kzalloc(list_size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	list_size = vfs_listxattr(old, buf, list_size);
	if (list_size <= 0) {
		error = list_size;
		goto out;
	}

	/* buf holds consecutive NUL-terminated attribute names */
	for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
retry:
		size = vfs_getxattr(old, name, value, value_size);
		/* Value buffer too small: ask for the required size */
		if (size == -ERANGE)
			size = vfs_getxattr(old, name, NULL, 0);

		if (size < 0) {
			if (OVL_OPT_NOFSCHECK(opt)) {
				OVL_DEBUG("fail: old=%pd4, i_ino=%lu, name=%s\n",
					  old, old->d_inode->i_ino, name);
				continue;
			} else {
				error = size;
				break;
			}
		}
		OVL_DEBUG("success: old=%pd4, i_ino=%lu, name=%s\n",
			  old, old->d_inode->i_ino, name);

		if (size > value_size) {
			void *grown;

			/* Grow the value buffer, then re-read this attribute */
			grown = krealloc(value, size, GFP_KERNEL);
			if (!grown) {
				error = -ENOMEM;
				break;
			}
			value = grown;
			value_size = size;
			goto retry;
		}

		error = vfs_setxattr(new, name, value, size, 0);
		if (error)
			break;
	}
	kfree(value);
out:
	kfree(buf);
	return error;
}
/*
 * Copy @len bytes from the file at @old to the file at @new in chunks of
 * at most OVL_COPY_UP_CHUNK_SIZE using do_splice_direct().
 *
 * Returns 0 when @len is 0 or the loop finishes, -EINTR when a fatal
 * signal is pending, the error from opening either file, or the
 * non-positive result of do_splice_direct() that stopped the loop.
 */
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
	struct file *src;
	struct file *dst;
	loff_t src_pos = 0;
	loff_t dst_pos = 0;
	int error = 0;

	if (!len)
		return 0;

	src = ovl_path_open(old, O_LARGEFILE | O_RDONLY);
	if (IS_ERR(src))
		return PTR_ERR(src);

	dst = ovl_path_open(new, O_LARGEFILE | O_WRONLY);
	if (IS_ERR(dst)) {
		error = PTR_ERR(dst);
		goto out_put_src;
	}

	/* FIXME: copy up sparse files efficiently */
	while (len != 0) {
		size_t chunk = OVL_COPY_UP_CHUNK_SIZE;
		long copied;

		/* Last round: only the remainder is left */
		if (len < chunk)
			chunk = len;

		if (signal_pending_state(TASK_KILLABLE, current)) {
			error = -EINTR;
			break;
		}

		copied = do_splice_direct(src, &src_pos, dst, &dst_pos,
					  chunk, SPLICE_F_MOVE);
		if (copied <= 0) {
			error = copied;
			break;
		}
		WARN_ON(src_pos != dst_pos);

		len -= copied;
	}

	fput(dst);
out_put_src:
	fput(src);
	return error;
}
/*
 * Read the target of the symlink @realdentry into a freshly allocated
 * page and NUL-terminate it.
 *
 * Returns the page (release with free_page()), ERR_PTR(-EINVAL) when the
 * inode has no readlink operation, ERR_PTR(-ENOMEM) when no page could
 * be allocated, or ERR_PTR() of the negative value readlink returned.
 */
static char *ovl_read_symlink(struct dentry *realdentry)
{
	struct inode *inode = realdentry->d_inode;
	mm_segment_t saved_fs;
	char *page;
	int len;

	if (!inode->i_op->readlink)
		return ERR_PTR(-EINVAL);

	page = (char *) __get_free_page(GFP_KERNEL);
	if (!page)
		return ERR_PTR(-ENOMEM);

	saved_fs = get_fs();
	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	len = inode->i_op->readlink(realdentry,
				    (char __user *)page, PAGE_SIZE - 1);
	set_fs(saved_fs);

	if (len < 0) {
		free_page((unsigned long) page);
		return ERR_PTR(len);
	}

	page[len] = '\0';
	return page;
}
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
return notify_change(upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
int err = 0;
if (!S_ISLNK(stat->mode)) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
return err;
}
/*
 * Create a copy of the object described by @stat/@lowerpath under a
 * temporary name in @workdir, fill in data (regular files), xattrs and
 * attributes, then rename it to @dentry's name in @upperdir.
 *
 * @link: symlink target passed to ovl_create_real(); may be NULL.
 *
 * Returns 0 on success or a negative errno.  On failure after the
 * temporary object was created, it is removed again via ovl_cleanup().
 */
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
struct kstat *stat, const char *link)
{
struct inode *wdir = workdir->d_inode;
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry = NULL;
struct dentry *upper = NULL;
umode_t mode = stat->mode;
unsigned opt = ovl_get_config_opt(dentry);
int err;
/* Temporary name in workdir for the object being built */
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out;
/* Final name in upperdir, used as the rename target */
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out1;
/* Can't properly set mode on creation because of the umask */
stat->mode &= S_IFMT;
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
/* Restore the caller's mode; it is applied later by ovl_set_attr() */
stat->mode = mode;
if (err)
goto out2;
if (S_ISREG(stat->mode)) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
BUG_ON(upperpath.dentry != NULL);
upperpath.dentry = newdentry;
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
if (err)
goto out_cleanup;
}
err = ovl_copy_xattr(lowerpath->dentry, newdentry, opt);
if (err)
goto out_cleanup;
inode_lock(newdentry->d_inode);
err = ovl_set_attr(newdentry, stat);
inode_unlock(newdentry->d_inode);
if (err)
goto out_cleanup;
/* Move the finished object from workdir to its final upper name */
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
ovl_dentry_update(dentry, newdentry);
/* Cleared so that the dput() at out1 does not drop this reference */
newdentry = NULL;
/*
* Non-directories become opaque when copied up.
*/
if (!S_ISDIR(stat->mode))
ovl_dentry_set_opaque(dentry, true);
out2:
dput(upper);
out1:
dput(newdentry);
out:
return err;
out_cleanup:
/* Remove the half-built temporary object, then take the normal exit */
ovl_cleanup(wdir, newdentry);
goto out2;
}
/*
* Copy up a single dentry
*
* Directory renames only allowed on "pure upper" (already created on
* upper filesystem, never copied up). Directories which are on lower or
* are merged may not be renamed. For these -EXDEV is returned and
* userspace has to deal with it. This means, when copying up a
* directory we can rely on it and ancestors being stable.
*
* Non-directory renames start with copy up of source if necessary. The
* actual rename will only proceed once the copy up was successful. Copy
* up uses upper parent i_mutex for exclusion. Since rename can change
* d_parent it is possible that the copy up will lock the old parent. At
* that point the file will have already been copied up anyway.
*
* Returns 0 on success (including when another copy-up already created
* the upper dentry), -EROFS when there is no workdir, -EIO when
* lock_rename() returns non-NULL for workdir/upperdir, -ENOMEM when
* credentials cannot be prepared, or the error of a failing helper.
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
struct kstat pstat;
struct path parentpath;
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
struct cred *override_cred;
char *link = NULL;
if (WARN_ON(!workdir))
return -EROFS;
ovl_do_check_copy_up(lowerpath->dentry);
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
/* Remember the parent's attributes; its timestamps are restored below */
err = vfs_getattr(&parentpath, &pstat);
if (err)
return err;
if (S_ISLNK(stat->mode)) {
/* link is a page from ovl_read_symlink(), freed at out_free_link */
link = ovl_read_symlink(lowerpath->dentry);
if (IS_ERR(link))
return PTR_ERR(link);
}
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_free_link;
/* Create the copy with the ownership recorded in stat */
override_cred->fsuid = stat->uid;
override_cred->fsgid = stat->gid;
/*
* CAP_SYS_ADMIN for copying up extended attributes
* CAP_DAC_OVERRIDE for create
* CAP_FOWNER for chmod, timestamp update
* CAP_FSETID for chmod
* CAP_CHOWN for chown
* CAP_MKNOD for mknod
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
cap_raise(override_cred->cap_effective, CAP_MKNOD);
old_cred = override_creds(override_cred);
err = -EIO;
/* A non-NULL result is treated as failure; the lock is still released */
if (lock_rename(workdir, upperdir) != NULL) {
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
/* Raced with another copy-up? Nothing to do, then... */
err = 0;
goto out_unlock;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
stat, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
revert_creds(old_cred);
put_cred(override_cred);
out_free_link:
if (link)
free_page((unsigned long) link);
return err;
}
/*
 * Copy @dentry up, one path component at a time.
 *
 * Each round walks from @dentry towards the root until it reaches the
 * entry whose parent already has the upper type, copies that entry up
 * with ovl_copy_up_one(), and repeats until @dentry itself has the upper
 * type or an error occurs.
 *
 * Returns 0 on success or the first error from vfs_getattr() /
 * ovl_copy_up_one().
 */
int ovl_copy_up(struct dentry *dentry)
{
	int err = 0;

	do {
		struct dentry *next;
		struct dentry *parent;
		struct path lowerpath;
		struct kstat stat;

		if (OVL_TYPE_UPPER(ovl_path_type(dentry)))
			break;

		/* find the topmost dentry not yet copied up */
		next = dget(dentry);
		parent = dget_parent(next);
		while (!OVL_TYPE_UPPER(ovl_path_type(parent))) {
			dput(next);
			next = parent;
			parent = dget_parent(next);
		}

		ovl_path_lower(next, &lowerpath);
		err = vfs_getattr(&lowerpath, &stat);
		if (!err)
			err = ovl_copy_up_one(parent, next, &lowerpath, &stat);

		dput(parent);
		dput(next);
	} while (!err);

	return err;
}

View File

@@ -0,0 +1,969 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
/*
 * Remove @wdentry from directory inode @wdir: rmdir for directories,
 * unlink otherwise.  A failure is only logged.  An extra reference on
 * @wdentry is held for the duration of the removal.
 */
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
	int err;

	dget(wdentry);
	err = d_is_dir(wdentry) ? ovl_do_rmdir(wdir, wdentry)
				: ovl_do_unlink(wdir, wdentry);
	dput(wdentry);

	if (!err)
		return;

	pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
	       wdentry, err);
}
/*
 * Look up a temporary name in @workdir.  The name is '#' followed by the
 * address of @dentry in hexadecimal.
 *
 * Returns the result of lookup_one_len(), or ERR_PTR(-EIO) when an
 * object with that name already exists.
 */
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
	char name[20];
	struct dentry *found;

	snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);

	found = lookup_one_len(name, workdir, strlen(name));
	if (IS_ERR(found) || !found->d_inode)
		return found;

	/* A positive dentry means the temporary name is already taken */
	pr_err("overlayfs: workdir/%s already exists\n", name);
	dput(found);
	return ERR_PTR(-EIO);
}
/*
 * Create a whiteout under a temporary name in @workdir.
 * Returns the whiteout dentry or an ERR_PTR().
 *
 * caller holds i_mutex on workdir
 */
static struct dentry *ovl_whiteout(struct dentry *workdir,
				   struct dentry *dentry)
{
	struct dentry *tmp = ovl_lookup_temp(workdir, dentry);
	int err;

	if (IS_ERR(tmp))
		return tmp;

	err = ovl_do_whiteout(workdir->d_inode, tmp);
	if (!err)
		return tmp;

	dput(tmp);
	return ERR_PTR(err);
}
/*
 * Create the object for @newdentry in directory inode @dir.
 *
 * With @hardlink set a link to it is created; otherwise the file type
 * bits of @stat->mode select create, mkdir, mknod or symlink (using
 * @link as the target).  @debug is forwarded to the ovl_do_*() helper.
 *
 * Returns 0 on success, -ESTALE when @newdentry is already positive,
 * -EPERM for an unknown file type, -ENOENT when the helper reported
 * success but left @newdentry negative, or the helper's error.
 */
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
		    struct kstat *stat, const char *link,
		    struct dentry *hardlink, bool debug)
{
	int err;

	if (newdentry->d_inode)
		return -ESTALE;

	if (hardlink) {
		err = ovl_do_link(hardlink, dir, newdentry, debug);
	} else {
		const unsigned int fmt = stat->mode & S_IFMT;

		if (fmt == S_IFREG)
			err = ovl_do_create(dir, newdentry, stat->mode, debug);
		else if (fmt == S_IFDIR)
			err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
		else if (fmt == S_IFCHR || fmt == S_IFBLK ||
			 fmt == S_IFIFO || fmt == S_IFSOCK)
			err = ovl_do_mknod(dir, newdentry,
					   stat->mode, stat->rdev, debug);
		else if (fmt == S_IFLNK)
			err = ovl_do_symlink(dir, newdentry, link, debug);
		else
			err = -EPERM;
	}

	if (err)
		return err;

	/*
	 * Not quite sure if non-instantiated dentry is legal or not.
	 * VFS doesn't seem to care so check and warn here.
	 */
	if (WARN_ON(!newdentry->d_inode))
		return -ENOENT;

	return 0;
}
/* Set the OVL_XATTR_OPAQUE xattr to "y" on @upperdentry. */
static int ovl_set_opaque(struct dentry *upperdentry)
{
	int ret;

	ret = ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
	return ret;
}
/*
 * Remove the OVL_XATTR_OPAQUE xattr from @upperdentry.  A failure is
 * only logged as a warning.
 */
static void ovl_remove_opaque(struct dentry *upperdentry)
{
	int err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);

	if (!err)
		return;

	pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
		upperdentry->d_name.name, err);
}
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
int err;
enum ovl_path_type type;
struct path realpath;
type = ovl_path_real(dentry, &realpath);
err = vfs_getattr(&realpath, stat);
if (err)
return err;
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
/*
* It's probably not worth it to count subdirs to get the
* correct link count. nlink=1 seems to pacify 'find' and
* other utilities.
*/
if (OVL_TYPE_MERGE(type))
stat->nlink = 1;
return 0;
}
/*
 * Create the object for @dentry directly in the upper directory of its
 * parent, under the parent's inode lock, and instantiate @dentry with
 * @inode on success.
 *
 * Returns 0 on success or the error from the lookup / creation.
 */
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
			    struct kstat *stat, const char *link,
			    struct dentry *hardlink)
{
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *created;
	int err;

	inode_lock_nested(udir, I_MUTEX_PARENT);

	created = lookup_one_len(dentry->d_name.name, upperdir,
				 dentry->d_name.len);
	if (IS_ERR(created)) {
		err = PTR_ERR(created);
		goto out_unlock;
	}

	err = ovl_create_real(udir, created, stat, link, hardlink, false);
	if (!err) {
		ovl_dentry_version_inc(dentry->d_parent);
		ovl_dentry_update(dentry, created);
		ovl_copyattr(created->d_inode, inode);
		d_instantiate(dentry, inode);
		/* Cleared so the dput() below does not drop this reference */
		created = NULL;
	}

	dput(created);
out_unlock:
	inode_unlock(udir);
	return err;
}
/*
 * Take the rename lock on @workdir and @upperdir.
 *
 * Returns 0 with the lock held, or -EIO (lock not held) when the two are
 * the same dentry or lock_rename() returns a non-NULL dentry.
 */
static int ovl_lock_rename_workdir(struct dentry *workdir,
				   struct dentry *upperdir)
{
	/* Workdir should not be the same as upperdir */
	if (workdir != upperdir) {
		/* Workdir should not be subdir of upperdir and vice versa */
		if (lock_rename(workdir, upperdir) == NULL)
			return 0;

		unlock_rename(workdir, upperdir);
	}

	pr_err("overlayfs: failed to lock workdir+upperdir\n");
	return -EIO;
}
/*
 * Replace the upper directory of @dentry with a newly created opaque
 * directory carrying the same xattrs and attributes.
 *
 * The replacement is built under a temporary name in the workdir,
 * exchanged with the existing upper directory via RENAME_EXCHANGE, and
 * the old directory (now in the workdir) is emptied of the whiteouts in
 * @list and removed.  @dentry is dropped from the dcache afterwards.
 *
 * Returns the new opaque directory's dentry, or an ERR_PTR():
 * -EROFS when there is no workdir, -ESTALE when the upper object is not
 * a directory or no longer sits in the expected parent, otherwise the
 * error of the failing helper.
 */
static struct dentry *ovl_clear_empty(struct dentry *dentry,
				      struct list_head *list)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct path upperpath;
	struct dentry *upper;
	struct dentry *opaquedir;
	struct kstat stat;
	unsigned opt = ovl_get_config_opt(dentry);
	int err;

	if (WARN_ON(!workdir))
		return ERR_PTR(-EROFS);

	/* Only dereference workdir once it is known to be non-NULL */
	wdir = workdir->d_inode;

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	ovl_path_upper(dentry, &upperpath);
	err = vfs_getattr(&upperpath, &stat);
	if (err)
		goto out_unlock;

	err = -ESTALE;
	if (!S_ISDIR(stat.mode))
		goto out_unlock;
	upper = upperpath.dentry;
	if (upper->d_parent->d_inode != udir)
		goto out_unlock;

	opaquedir = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(opaquedir);
	if (IS_ERR(opaquedir))
		goto out_unlock;

	err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
	if (err)
		goto out_dput;

	err = ovl_copy_xattr(upper, opaquedir, opt);
	if (err)
		goto out_cleanup;

	err = ovl_set_opaque(opaquedir);
	if (err)
		goto out_cleanup;

	inode_lock(opaquedir->d_inode);
	err = ovl_set_attr(opaquedir, &stat);
	inode_unlock(opaquedir->d_inode);
	if (err)
		goto out_cleanup;

	/* Swap the new opaque directory with the existing upper one */
	err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
	if (err)
		goto out_cleanup;

	/* The old directory is what gets cleaned up from here on */
	ovl_cleanup_whiteouts(upper, list);
	ovl_cleanup(wdir, upper);
	unlock_rename(workdir, upperdir);

	/* dentry's upper doesn't match now, get rid of it */
	d_drop(dentry);

	return opaquedir;

out_cleanup:
	ovl_cleanup(wdir, opaquedir);
out_dput:
	dput(opaquedir);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return ERR_PTR(err);
}
/*
 * Check that directory @dentry is empty and, if it has an upper dentry,
 * replace that with an opaque directory via ovl_clear_empty().
 *
 * Returns ERR_PTR() when the emptiness check fails, NULL when there is
 * no upper dentry, otherwise the result of ovl_clear_empty().
 */
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
	LIST_HEAD(list);
	struct dentry *ret;
	int err;

	err = ovl_check_empty_dir(dentry, &list);
	if (err) {
		ret = ERR_PTR(err);
	} else if (ovl_dentry_upper(dentry)) {
		/*
		 * Can race with copy-up, since we don't hold the upperdir
		 * mutex.  Doesn't matter, since copy-up can't create a
		 * non-empty directory from an empty one.
		 */
		ret = ovl_clear_empty(dentry, &list);
	} else {
		/* If no upperdentry then skip clearing whiteouts. */
		ret = NULL;
	}

	ovl_cache_free(&list);
	return ret;
}
/*
 * Create the object for @dentry under a temporary name in the workdir
 * and rename it over the existing upper entry of the same name.
 *
 * Directories are marked opaque and swapped in with RENAME_EXCHANGE,
 * after which the previous upper entry is removed from the workdir;
 * other objects are renamed over the existing entry directly.
 *
 * Returns 0 on success, -EROFS when there is no workdir, otherwise the
 * error of the failing helper.  On failure after creation the temporary
 * object is removed again.
 */
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
				    struct kstat *stat, const char *link,
				    struct dentry *hardlink)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *upper;
	struct dentry *newdentry;
	int err;

	if (WARN_ON(!workdir))
		return -EROFS;

	/* Only dereference workdir once it is known to be non-NULL */
	wdir = workdir->d_inode;

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	newdentry = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out_unlock;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out_dput;

	err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
	if (err)
		goto out_dput2;

	if (S_ISDIR(stat->mode)) {
		err = ovl_set_opaque(newdentry);
		if (err)
			goto out_cleanup;

		err = ovl_do_rename(wdir, newdentry, udir, upper,
				    RENAME_EXCHANGE);
		if (err)
			goto out_cleanup;

		/* After the exchange the old upper entry sits in workdir */
		ovl_cleanup(wdir, upper);
	} else {
		err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
		if (err)
			goto out_cleanup;
	}
	ovl_dentry_version_inc(dentry->d_parent);
	ovl_dentry_update(dentry, newdentry);
	ovl_copyattr(newdentry->d_inode, inode);
	d_instantiate(dentry, inode);
	/* Cleared so the dput() at out_dput does not drop this reference */
	newdentry = NULL;
out_dput2:
	dput(upper);
out_dput:
	dput(newdentry);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return err;

out_cleanup:
	ovl_cleanup(wdir, newdentry);
	goto out_dput2;
}
/*
 * Common creation path: allocate an overlay inode, copy up the parent of
 * @dentry, then create the object either directly in the upper directory
 * or, when @dentry is opaque, over the existing entry with temporarily
 * raised capabilities.
 *
 * Returns 0 on success, -ENOMEM when the inode or credentials cannot be
 * allocated, otherwise the error of the failing helper.  The inode is
 * released on every failure path.
 */
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
			      const char *link, struct dentry *hardlink)
{
	struct kstat stat = {
		.mode = mode,
		.rdev = rdev,
	};
	struct inode *inode;
	int err;

	inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
	if (!inode)
		return -ENOMEM;

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_iput;

	if (ovl_dentry_is_opaque(dentry)) {
		const struct cred *saved_cred;
		struct cred *raised_cred;

		raised_cred = prepare_creds();
		if (!raised_cred) {
			err = -ENOMEM;
			goto out_iput;
		}

		/*
		 * CAP_SYS_ADMIN for setting opaque xattr
		 * CAP_DAC_OVERRIDE for create in workdir, rename
		 * CAP_FOWNER for removing whiteout from sticky dir
		 */
		cap_raise(raised_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(raised_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(raised_cred->cap_effective, CAP_FOWNER);
		saved_cred = override_creds(raised_cred);

		err = ovl_create_over_whiteout(dentry, inode, &stat, link,
					       hardlink);

		revert_creds(saved_cred);
		put_cred(raised_cred);
	} else {
		err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
	}

	/* On success the inode now belongs to the dentry; skip the release */
	if (!err)
		inode = NULL;
out_iput:
	iput(inode);
	return err;
}
/*
 * Bracket ovl_create_or_link() (without a hardlink source) with
 * ovl_want_write() / ovl_drop_write().  Returns the first error.
 */
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
			     const char *link)
{
	int err = ovl_want_write(dentry);

	if (err)
		return err;

	err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
	ovl_drop_write(dentry);
	return err;
}
/* ->create: make a regular file, keeping only the permission bits of @mode. */
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
		      bool excl)
{
	int file_mode = (mode & 07777) | S_IFREG;

	return ovl_create_object(dentry, file_mode, 0, NULL);
}
/* ->mkdir: make a directory, keeping only the permission bits of @mode. */
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int dir_mode = (mode & 07777) | S_IFDIR;

	return ovl_create_object(dentry, dir_mode, 0, NULL);
}
/*
 * ->mknod: create a special file.  A character device with WHITEOUT_DEV as
 * its device number is refused with -EPERM.
 */
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
		     dev_t rdev)
{
	/* Don't allow creation of "whiteout" on overlay */
	const bool wants_whiteout = S_ISCHR(mode) && rdev == WHITEOUT_DEV;

	if (wants_whiteout)
		return -EPERM;

	return ovl_create_object(dentry, mode, rdev, NULL);
}
/* ->symlink: create a symbolic link at @dentry whose target is @link. */
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
		       const char *link)
{
	const int link_mode = S_IFLNK;

	return ovl_create_object(dentry, link_mode, 0, link);
}
/*
 * ->link: hard-link @old to @new.  @old is copied up first so that the link
 * can be made against its upper dentry.  Returns 0 or a negative errno.
 */
static int ovl_link(struct dentry *old, struct inode *newdir,
		    struct dentry *new)
{
	struct dentry *upper;
	int err = ovl_want_write(old);

	if (err)
		return err;

	err = ovl_copy_up(old);
	if (!err) {
		upper = ovl_dentry_upper(old);
		err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL,
					 upper);
	}

	ovl_drop_write(old);
	return err;
}
/*
 * Remove @dentry by putting a whiteout in its place in the upper directory.
 *
 * For a directory, emptiness is checked first: a merged or lower directory
 * goes through ovl_check_empty_and_clear(), any other directory only gets
 * the emptiness check.  The whiteout is created in the work directory and
 * renamed over the upper entry; a directory is swapped with RENAME_EXCHANGE
 * and the displaced entry is cleaned up from the work directory afterwards.
 *
 * Returns 0 on success, -EROFS when there is no work directory, -ESTALE
 * when the upper entry found by lookup is not the expected one, or another
 * negative errno from the helpers.
 */
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *whiteout;
	struct dentry *upper;
	struct dentry *opaquedir = NULL;
	int err;
	int flags = 0;

	/*
	 * The work directory must be validated before it is dereferenced;
	 * reading workdir->d_inode in the initializers above would oops
	 * before this check could ever trigger.
	 */
	if (WARN_ON(!workdir))
		return -EROFS;
	wdir = workdir->d_inode;

	if (is_dir) {
		if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
			opaquedir = ovl_check_empty_and_clear(dentry);
			err = PTR_ERR(opaquedir);
			if (IS_ERR(opaquedir))
				goto out;
		} else {
			LIST_HEAD(list);

			/*
			 * When removing an empty opaque directory, then it
			 * makes no sense to replace it with an exact replica of
			 * itself.  But emptiness still needs to be checked.
			 */
			err = ovl_check_empty_dir(dentry, &list);
			ovl_cache_free(&list);
			if (err)
				goto out;
		}
	}

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out_dput;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out_unlock;

	/* The looked-up upper entry must be the one we expect to replace. */
	err = -ESTALE;
	if ((opaquedir && upper != opaquedir) ||
	    (!opaquedir && ovl_dentry_upper(dentry) &&
	     upper != ovl_dentry_upper(dentry))) {
		goto out_dput_upper;
	}

	whiteout = ovl_whiteout(workdir, dentry);
	err = PTR_ERR(whiteout);
	if (IS_ERR(whiteout))
		goto out_dput_upper;

	/* A directory is exchanged with the whiteout rather than replaced. */
	if (d_is_dir(upper))
		flags = RENAME_EXCHANGE;

	err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
	if (err)
		goto kill_whiteout;
	if (flags)
		ovl_cleanup(wdir, upper);

	ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
	d_drop(dentry);
	dput(whiteout);
out_dput_upper:
	dput(upper);
out_unlock:
	unlock_rename(workdir, upperdir);
out_dput:
	dput(opaquedir);
out:
	return err;

kill_whiteout:
	/* The rename failed: remove the whiteout left in the work directory. */
	ovl_cleanup(wdir, whiteout);
	goto out_d_drop;
}
/*
 * Remove an entry that needs no whiteout by unlinking or rmdir-ing it
 * directly in the upper directory, under the upper directory's inode lock.
 *
 * Returns 0 on success, -ESTALE if the upper entry found by lookup is not
 * the one recorded for @dentry, or a negative errno from the VFS.
 */
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *dir = upperdir->d_inode;
	struct dentry *upper;
	int err;

	inode_lock_nested(dir, I_MUTEX_PARENT);

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	if (IS_ERR(upper)) {
		err = PTR_ERR(upper);
		goto out_unlock;
	}

	if (upper != ovl_dentry_upper(dentry)) {
		err = -ESTALE;
	} else {
		if (is_dir)
			err = vfs_rmdir(dir, upper);
		else
			err = vfs_unlink(dir, upper, NULL);
		ovl_dentry_version_inc(dentry->d_parent);
	}
	dput(upper);

	/*
	 * Keeping this dentry hashed would mean having to release
	 * upperpath/lowerpath, which could only be done if we are the
	 * sole user of this dentry.  Too tricky...  Just unhash for
	 * now.
	 */
	if (!err)
		d_drop(dentry);
out_unlock:
	inode_unlock(dir);
	return err;
}
/*
 * Run check_sticky() on the real inodes behind @dentry and its parent.
 * Returns -EPERM when check_sticky() reports a non-zero result, else 0.
 */
static inline int ovl_check_sticky(struct dentry *dentry)
{
	struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
	struct inode *inode = ovl_dentry_real(dentry)->d_inode;

	return check_sticky(dir, inode) ? -EPERM : 0;
}
/*
 * Shared implementation of unlink and rmdir.
 *
 * After the sticky check, write access is taken and the parent is copied
 * up.  A pure upper entry is removed directly; anything else is replaced
 * by a whiteout under temporarily raised capabilities.
 * Returns 0 or a negative errno.
 */
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
	enum ovl_path_type ptype;
	int err;

	err = ovl_check_sticky(dentry);
	if (err)
		return err;

	err = ovl_want_write(dentry);
	if (err)
		return err;

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_drop_write;

	ptype = ovl_path_type(dentry);
	if (!OVL_TYPE_PURE_UPPER(ptype)) {
		const struct cred *saved_cred;
		struct cred *raised_cred = prepare_creds();

		if (!raised_cred) {
			err = -ENOMEM;
			goto out_drop_write;
		}

		/*
		 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
		 * CAP_DAC_OVERRIDE for create in workdir, rename
		 * CAP_FOWNER for removing whiteout from sticky dir
		 * CAP_FSETID for chmod of opaque dir
		 * CAP_CHOWN for chown of opaque dir
		 */
		cap_raise(raised_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(raised_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(raised_cred->cap_effective, CAP_FOWNER);
		cap_raise(raised_cred->cap_effective, CAP_FSETID);
		cap_raise(raised_cred->cap_effective, CAP_CHOWN);

		saved_cred = override_creds(raised_cred);
		err = ovl_remove_and_whiteout(dentry, is_dir);
		revert_creds(saved_cred);
		put_cred(raised_cred);
	} else {
		err = ovl_remove_upper(dentry, is_dir);
	}
out_drop_write:
	ovl_drop_write(dentry);
	return err;
}
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, false);
}
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, true);
}
/*
 * ->rename2: rename @old to @new, or exchange them when RENAME_EXCHANGE is
 * set, by performing the rename on the corresponding upper-layer dentries.
 *
 * Only RENAME_EXCHANGE and RENAME_NOREPLACE are accepted (-EINVAL
 * otherwise); RENAME_NOREPLACE is cleared before any further processing.
 * Directories that are merged or lower-only are refused with -EXDEV rather
 * than copied up.  When the source and target already refer to the same
 * underlying inode the function returns 0 without doing anything.
 *
 * Returns 0 on success, -ESTALE when the upper dentries found by lookup do
 * not match the overlay's view, or another negative errno.
 */
static int ovl_rename2(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new,
unsigned int flags)
{
int err;
enum ovl_path_type old_type;
enum ovl_path_type new_type;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
struct dentry *olddentry;
struct dentry *newdentry;
struct dentry *trap;
bool old_opaque;
bool new_opaque;
bool cleanup_whiteout = false;
/* "overwrite" means a plain rename, i.e. not an exchange. */
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = d_is_dir(old);
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
struct cred *override_cred = NULL;
err = -EINVAL;
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
goto out;
flags &= ~RENAME_NOREPLACE;
err = ovl_check_sticky(old);
if (err)
goto out;
/* Don't copy up directory trees */
old_type = ovl_path_type(old);
err = -EXDEV;
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
goto out;
if (new->d_inode) {
err = ovl_check_sticky(new);
if (err)
goto out;
if (d_is_dir(new))
new_is_dir = true;
new_type = ovl_path_type(new);
err = -EXDEV;
/* An exchange would have to move a merged/lower directory too. */
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
goto out;
/* Same underlying inode on both sides: nothing to do, return 0. */
err = 0;
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_lower(old)->d_inode ==
ovl_dentry_lower(new)->d_inode)
goto out;
}
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_upper(old)->d_inode ==
ovl_dentry_upper(new)->d_inode)
goto out;
}
} else {
/* Negative target: synthesize a type from its opaque flag. */
if (ovl_dentry_is_opaque(new))
new_type = __OVL_PATH_UPPER;
else
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
}
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
err = ovl_copy_up(new->d_parent);
if (err)
goto out_drop_write;
/* For an exchange the target itself must also be in the upper layer. */
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
goto out_drop_write;
}
/* From here on "opaque" means "not pure upper" for each side. */
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
if (old_opaque || new_opaque) {
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
}
/* Overwriting a merged/lower directory: check it is empty first. */
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
/* Reset so that the dput() at "out" sees NULL, not an ERR_PTR. */
opaquedir = NULL;
goto out_revert_creds;
}
}
/* Choose the extra rename flags needed to keep whiteouts consistent. */
if (overwrite) {
if (old_opaque) {
if (new->d_inode || !new_opaque) {
/* Whiteout source */
flags |= RENAME_WHITEOUT;
} else {
/* Switch whiteouts */
flags |= RENAME_EXCHANGE;
}
} else if (is_dir && !new->d_inode && new_opaque) {
flags |= RENAME_EXCHANGE;
cleanup_whiteout = true;
}
}
old_upperdir = ovl_dentry_upper(old->d_parent);
new_upperdir = ovl_dentry_upper(new->d_parent);
trap = lock_rename(new_upperdir, old_upperdir);
/* Re-lookup both names under the lock and verify nothing changed. */
olddentry = lookup_one_len(old->d_name.name, old_upperdir,
old->d_name.len);
err = PTR_ERR(olddentry);
if (IS_ERR(olddentry))
goto out_unlock;
err = -ESTALE;
if (olddentry != ovl_dentry_upper(old))
goto out_dput_old;
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
new->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_dput_old;
err = -ESTALE;
if (ovl_dentry_upper(new)) {
if (opaquedir) {
if (newdentry != opaquedir)
goto out_dput;
} else {
if (newdentry != ovl_dentry_upper(new))
goto out_dput;
}
} else {
/* No upper for new: only a negative dentry or a whiteout is acceptable. */
if (!d_is_negative(newdentry) &&
(!new_opaque || !ovl_is_whiteout(newdentry)))
goto out_dput;
}
/* Refuse (-ESTALE) if either side is the dentry returned by lock_rename(). */
if (olddentry == trap)
goto out_dput;
if (newdentry == trap)
goto out_dput;
/* Set the opaque marker before the rename; undone below on failure. */
if (is_dir && !old_opaque && new_opaque) {
err = ovl_set_opaque(olddentry);
if (err)
goto out_dput;
}
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_dput;
}
if (old_opaque || new_opaque) {
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
flags);
} else {
/* No debug for the plain case */
BUG_ON(flags & ~RENAME_EXCHANGE);
err = vfs_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
NULL, flags);
}
if (err) {
/* Roll back the opaque markers that were set just before the rename. */
if (is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(newdentry);
goto out_dput;
}
/* Success: drop opaque markers that no longer apply at the new location. */
if (is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
/*
* Old dentry now lives in different location. Dentries in
* lowerstack are stale. We cannot drop them here because
* access to them is lockless. This could be only pure upper
* or opaque directory - numlower is zero. Or upper non-dir
* entry - its pureness is tracked by flag opaque.
*/
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
ovl_dentry_set_opaque(new, old_opaque);
}
if (cleanup_whiteout)
ovl_cleanup(old_upperdir->d_inode, newdentry);
ovl_dentry_version_inc(old->d_parent);
ovl_dentry_version_inc(new->d_parent);
out_dput:
dput(newdentry);
out_dput_old:
dput(olddentry);
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
/* Same condition as the one under which the credentials were overridden. */
if (old_opaque || new_opaque) {
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(old);
out:
dput(opaquedir);
return err;
}
/* Inode operations installed on overlay directory inodes. */
const struct inode_operations ovl_dir_inode_operations = {
	/* name resolution */
	.lookup		= ovl_lookup,

	/* object creation */
	.create		= ovl_create,
	.mkdir		= ovl_mkdir,
	.mknod		= ovl_mknod,
	.symlink	= ovl_symlink,
	.link		= ovl_link,

	/* removal and rename */
	.unlink		= ovl_unlink,
	.rmdir		= ovl_rmdir,
	.rename2	= ovl_rename2,

	/* attributes and permissions */
	.setattr	= ovl_setattr,
	.getattr	= ovl_dir_getattr,
	.permission	= ovl_permission,

	/* extended attributes */
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
};

View File

@@ -0,0 +1,494 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"
/*
 * Copy @dentry up with its size forced to zero: the parent is copied up,
 * the lower attributes are read, stat.size is cleared and the result is
 * handed to ovl_copy_up_one().  Returns 0 or a negative errno.
 */
static int ovl_copy_up_truncate(struct dentry *dentry)
{
	struct path lowerpath;
	struct kstat stat;
	struct dentry *parent = dget_parent(dentry);
	int err = ovl_copy_up(parent);

	if (!err) {
		ovl_path_lower(dentry, &lowerpath);
		err = vfs_getattr(&lowerpath, &stat);
	}
	if (!err) {
		stat.size = 0;
		err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat);
	}

	dput(parent);
	return err;
}
/*
 * ->setattr: apply @attr to the upper copy of @dentry, copying it up first.
 *
 * Returns 0 immediately when the OVL_OPT_NOCOPYUPW option bit is set.
 * A size change is refused with -ETXTBSY while the real inode's
 * i_writecount is negative, and write access on the upper inode is held
 * around notify_change().  Returns 0 or a negative errno.
 */
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *write_inode = NULL;
	struct dentry *upper;
	int err;

	if (OVL_OPT_NOCOPYUPW(ovl_get_config_opt(dentry)))
		return 0;

	/*
	 * Check for permissions before trying to copy-up.  This is redundant
	 * since it will be rechecked later by ->setattr() on upper dentry.  But
	 * without this, copy-up can be triggered by just about anybody.
	 *
	 * We don't initialize inode->size, which just means that
	 * inode_newsize_ok() will always check against MAX_LFS_FILESIZE and not
	 * check for a swapfile (which this won't be anyway).
	 */
	err = inode_change_ok(dentry->d_inode, attr);
	if (err)
		return err;

	err = ovl_want_write(dentry);
	if (err)
		return err;

	if (attr->ia_valid & ATTR_SIZE) {
		struct inode *realinode = d_inode(ovl_dentry_real(dentry));

		if (atomic_read(&realinode->i_writecount) < 0) {
			err = -ETXTBSY;
			goto out_drop_write;
		}
	}

	err = ovl_copy_up(dentry);
	if (err)
		goto out_drop_write;

	upper = ovl_dentry_upper(dentry);
	if (attr->ia_valid & ATTR_SIZE) {
		write_inode = d_inode(upper);
		err = get_write_access(write_inode);
		if (err)
			goto out_drop_write;
	}

	if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
		attr->ia_valid &= ~ATTR_MODE;

	inode_lock(upper->d_inode);
	err = notify_change(upper, attr, NULL);
	if (!err)
		ovl_copyattr(upper->d_inode, dentry->d_inode);
	inode_unlock(upper->d_inode);

	if (write_inode)
		put_write_access(write_inode);
out_drop_write:
	ovl_drop_write(dentry);
	return err;
}
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
ovl_path_real(dentry, &realpath);
return vfs_getattr(&realpath, stat);
}
/*
 * ->permission: check @mask against the real inode behind overlay @inode.
 *
 * Directories carry their ovl_entry in inode->i_private.  For anything else
 * the entry is taken from a dentry alias, which cannot be done when
 * MAY_NOT_BLOCK is set, so -ECHILD is returned in that case.
 *
 * Returns 0 if access is allowed, otherwise a negative errno (-ECHILD,
 * -ENOENT, -ESTALE, -EROFS or whatever the underlying check returns).
 */
int ovl_permission(struct inode *inode, int mask)
{
struct ovl_entry *oe;
/* Stays NULL on the directory path; only set for non-directories. */
struct dentry *alias = NULL;
struct inode *realinode;
struct dentry *realdentry;
bool is_upper;
int err;
if (S_ISDIR(inode->i_mode)) {
oe = inode->i_private;
} else if (mask & MAY_NOT_BLOCK) {
return -ECHILD;
} else {
/*
* For non-directories find an alias and get the info
* from there.
*/
alias = d_find_any_alias(inode);
if (WARN_ON(!alias))
return -ENOENT;
oe = alias->d_fsdata;
/* NOTE(review): may replace oe; helper is defined elsewhere - confirm. */
ovl_reset_ovl_entry(&oe, alias);
}
realdentry = ovl_entry_real(oe, &is_upper);
if (ovl_is_default_permissions(inode)) {
struct kstat stat;
struct path realpath = { .dentry = realdentry };
/* alias is NULL whenever MAY_NOT_BLOCK is set, so nothing to dput here. */
if (mask & MAY_NOT_BLOCK)
return -ECHILD;
realpath.mnt = ovl_entry_mnt_real(oe, inode, is_upper);
err = vfs_getattr(&realpath, &stat);
if (err)
goto out_dput;
/* The file type must not differ between overlay and real inode. */
err = -ESTALE;
if ((stat.mode ^ inode->i_mode) & S_IFMT)
goto out_dput;
/* Refresh the overlay inode's owner and mode, then check against it. */
inode->i_mode = stat.mode;
inode->i_uid = stat.uid;
inode->i_gid = stat.gid;
err = generic_permission(inode, mask);
goto out_dput;
}
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
err = -ENOENT;
goto out_dput;
}
if (mask & MAY_WRITE) {
umode_t mode = realinode->i_mode;
/*
* Writes will always be redirected to upper layer, so
* ignore lower layer being read-only.
*
* If the overlay itself is read-only then proceed
* with the permission check, don't return EROFS.
* This will only happen if this is the lower layer of
* another overlayfs.
*
* If upper fs becomes read-only after the overlay was
* constructed return EROFS to prevent modification of
* upper layer.
*/
err = -EROFS;
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
goto out_dput;
}
err = __inode_permission(realinode, mask);
out_dput:
dput(alias);
return err;
}
/*
 * ->get_link: forward to the real inode's ->get_link.
 * Returns ERR_PTR(-ECHILD) when called without a dentry and
 * ERR_PTR(-EPERM) (with a warning) when the real inode has no ->get_link.
 */
static const char *ovl_get_link(struct dentry *dentry,
				struct inode *inode,
				struct delayed_call *done)
{
	struct dentry *real;
	struct inode *target;

	if (!dentry)
		return ERR_PTR(-ECHILD);

	real = ovl_dentry_real(dentry);
	target = real->d_inode;

	if (WARN_ON(!target->i_op->get_link))
		return ERR_PTR(-EPERM);

	return target->i_op->get_link(real, target, done);
}
/*
 * ->readlink: forward to the real inode's ->readlink after touching atime
 * on the real path.  Returns -EINVAL when the real inode has no ->readlink.
 */
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
	struct path real;
	struct inode *target;

	ovl_path_real(dentry, &real);
	target = real.dentry->d_inode;

	if (target->i_op->readlink) {
		touch_atime(&real);
		return target->i_op->readlink(real.dentry, buf, bufsiz);
	}

	return -EINVAL;
}
/* True when @name starts with the overlay-private xattr prefix. */
static bool ovl_is_private_xattr(const char *name)
{
	return !strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN);
}
/*
 * ->setxattr: set @name on the upper copy of @dentry, copying it up first.
 *
 * Returns 0 immediately when the OVL_OPT_NOCOPYUPW option bit is set and
 * -EPERM for names carrying the overlay-private prefix.
 */
int ovl_setxattr(struct dentry *dentry, const char *name,
		 const void *value, size_t size, int flags)
{
	int err;

	if (OVL_OPT_NOCOPYUPW(ovl_get_config_opt(dentry)))
		return 0;

	err = ovl_want_write(dentry);
	if (err)
		return err;

	if (ovl_is_private_xattr(name)) {
		err = -EPERM;
		goto out_drop_write;
	}

	err = ovl_copy_up(dentry);
	if (!err)
		err = vfs_setxattr(ovl_dentry_upper(dentry), name, value,
				   size, flags);
out_drop_write:
	ovl_drop_write(dentry);
	return err;
}
/*
 * Private xattrs are filtered only for directories whose path type has the
 * UPPER bit set and the PURE bit clear.
 */
static bool ovl_need_xattr_filter(struct dentry *dentry,
				  enum ovl_path_type type)
{
	const unsigned int bits = type & (__OVL_PATH_PURE | __OVL_PATH_UPPER);

	if (bits != __OVL_PATH_UPPER)
		return false;

	return S_ISDIR(dentry->d_inode->i_mode);
}
/*
 * ->getxattr: read @name from the real dentry.  Overlay-private names are
 * reported as -ENODATA when filtering applies to @dentry.
 */
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
		     void *value, size_t size)
{
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
	bool hidden;

	hidden = ovl_need_xattr_filter(dentry, type) &&
		 ovl_is_private_xattr(name);
	if (hidden)
		return -ENODATA;

	return vfs_getxattr(realpath.dentry, name, value, size);
}
/*
 * ->listxattr: list xattrs of the real dentry and, when filtering applies,
 * squeeze the overlay-private names out of the returned buffer in place.
 * Returns the resulting list length or a negative errno.
 */
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
	ssize_t res;
	int off;

	res = vfs_listxattr(realpath.dentry, list, size);
	if (res <= 0 || size == 0 || !ovl_need_xattr_filter(dentry, type))
		return res;

	/* filter out private xattrs */
	off = 0;
	while (off < res) {
		char *entry = list + off;
		size_t entry_len = strlen(entry) + 1;

		BUG_ON(off + entry_len > res);

		if (!ovl_is_private_xattr(entry)) {
			off += entry_len;
			continue;
		}

		/* Drop this name by sliding the rest of the list over it. */
		res -= entry_len;
		memmove(entry, entry + entry_len, res - off);
	}

	return res;
}
/*
 * ->removexattr: remove @name from the upper copy of @dentry.
 *
 * Returns 0 immediately when the OVL_OPT_NOCOPYUPW option bit is set and
 * -ENODATA for filtered overlay-private names.  For an object with no
 * upper copy, the attribute is probed on the real dentry first so that a
 * copy-up only happens when there is something to remove.
 */
int ovl_removexattr(struct dentry *dentry, const char *name)
{
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
	unsigned opt = ovl_get_config_opt(dentry);
	int err;

	if (OVL_OPT_NOCOPYUPW(opt))
		return 0;

	err = ovl_want_write(dentry);
	if (err)
		return err;

	if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name)) {
		err = -ENODATA;
		goto out_drop_write;
	}

	if (!OVL_TYPE_UPPER(type)) {
		err = vfs_getxattr(realpath.dentry, name, NULL, 0);
		if (err < 0)
			goto out_drop_write;

		err = ovl_copy_up(dentry);
		if (err)
			goto out_drop_write;

		ovl_path_upper(dentry, &realpath);
	}

	err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
	ovl_drop_write(dentry);
	return err;
}
/*
 * An open needs a copy-up only when the object has no upper copy, is not a
 * special file, and is opened for writing or with O_TRUNC.
 */
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
				  struct dentry *realdentry)
{
	return !OVL_TYPE_UPPER(type) &&
	       !special_file(realdentry->d_inode->i_mode) &&
	       ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC));
}
/*
 * Pick the inode that an open of @dentry with @file_flags should operate
 * on.
 *
 * Directories return their own backing inode.  For other objects the real
 * path is resolved and, unless the OVL_OPT_NOCOPYUPW option bit is set,
 * copied up when ovl_open_need_copy_up() says so (with truncation when
 * O_TRUNC is set).  If the real dentry has DCACHE_OP_SELECT_INODE set, the
 * decision is delegated to its own ->d_select_inode.
 *
 * Returns the selected inode, or an ERR_PTR() if taking write access or
 * the copy-up failed.
 */
struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
{
int err;
struct path realpath;
enum ovl_path_type type;
unsigned opt = ovl_get_config_opt(dentry);
if (d_is_dir(dentry))
return d_backing_inode(dentry);
type = ovl_path_real(dentry, &realpath);
if (!OVL_OPT_NOCOPYUPW(opt) &&
ovl_open_need_copy_up(file_flags, type, realpath.dentry)) {
OVL_DEBUG("copyup: realpath.dentry=%pd4, i_ino=%lu\n",
realpath.dentry, realpath.dentry->d_inode->i_ino);
err = ovl_want_write(dentry);
if (err)
return ERR_PTR(err);
if (file_flags & O_TRUNC)
err = ovl_copy_up_truncate(dentry);
else
err = ovl_copy_up(dentry);
ovl_drop_write(dentry);
if (err)
return ERR_PTR(err);
/* After a successful copy-up, continue with the upper path. */
ovl_path_upper(dentry, &realpath);
}
if (realpath.dentry->d_flags & DCACHE_OP_SELECT_INODE)
return realpath.dentry->d_op->d_select_inode(realpath.dentry, file_flags);
/* With the NOFSCHECK option, real dentries living on sysfs get special handling. */
if (OVL_OPT_NOFSCHECK(opt)) {
if (realpath.dentry->d_inode->i_sb->s_magic == SYSFS_MAGIC) {
OVL_DEBUG("sysfs: dentry=%pd4, i_ino=%lu\n",
dentry, dentry->d_inode->i_ino);
OVL_DEBUG("sysfs: realpath.dentry=%pd4, i_ino=%lu\n",
realpath.dentry, realpath.dentry->d_inode->i_ino);
/*
* NOTE(review): ovl_add_d_fsdata() returns int but the result is
* ignored, and d_fsdata is overwritten regardless - presumably the
* helper records the original d_fsdata first; confirm, and confirm
* what should happen if it fails.
*/
if (!ovl_find_d_fsdata(dentry)) {
ovl_add_d_fsdata(dentry);
dentry->d_fsdata = realpath.dentry->d_fsdata;
}
}
}
return d_backing_inode(realpath.dentry);
}
/* Inode operations for every non-directory, non-symlink overlay inode. */
static const struct inode_operations ovl_file_inode_operations = {
	/* attributes and permissions */
	.setattr	= ovl_setattr,
	.getattr	= ovl_getattr,
	.permission	= ovl_permission,

	/* extended attributes */
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
};
/* Inode operations for overlay symlink inodes. */
static const struct inode_operations ovl_symlink_inode_operations = {
	/* link target access */
	.get_link	= ovl_get_link,
	.readlink	= ovl_readlink,

	/* attributes */
	.setattr	= ovl_setattr,
	.getattr	= ovl_getattr,

	/* extended attributes */
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
};
/*
 * Allocate an overlay inode on @sb for file type/mode @mode and install the
 * operation tables matching the file type.  Directories additionally store
 * @oe in i_private.
 *
 * Returns the new inode, or NULL on allocation failure or for an
 * unrecognised file type (which also triggers a warning).
 */
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
			    struct ovl_entry *oe)
{
	struct inode *inode = new_inode(sb);
	umode_t fmt;

	if (!inode)
		return NULL;

	inode->i_ino = get_next_ino();
	inode->i_mode = mode;
	inode->i_flags |= S_NOATIME | S_NOCMTIME;

	fmt = mode & S_IFMT;
	switch (fmt) {
	case S_IFDIR:
		inode->i_private = oe;
		inode->i_op = &ovl_dir_inode_operations;
		inode->i_fop = &ovl_dir_operations;
		break;

	case S_IFLNK:
		inode->i_op = &ovl_symlink_inode_operations;
		break;

	case S_IFREG:
	case S_IFSOCK:
	case S_IFBLK:
	case S_IFCHR:
	case S_IFIFO:
		inode->i_op = &ovl_file_inode_operations;
		break;

	default:
		WARN(1, "illegal file type: %i\n", fmt);
		iput(inode);
		inode = NULL;
	}

	return inode;
}

View File

@@ -0,0 +1,230 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/kernel.h>
/*
 * OVL_DEBUG() - overlayfs debug trace.
 *
 * Define DEBUG (uncomment the line below) to have each call print through
 * pr_err(), prefixed with "[DEBUG] <function>(): ".  Otherwise the macro
 * expands to an empty statement.  Both forms are single statements, so the
 * macro is safe in an unbraced if/else.
 */
/* #define DEBUG */
#ifdef DEBUG
#define OVL_DEBUG(format, ...) \
	pr_err("[DEBUG] %s(): " format, __func__, ##__VA_ARGS__)
#else
#define OVL_DEBUG(format, ...) do { } while (0)
#endif
struct ovl_entry;
/*
 * Bit flags describing where an overlay dentry lives.  A value with no
 * bits set is treated as "lower" by OVL_TYPE_MERGE_OR_LOWER().
 */
enum ovl_path_type {
	__OVL_PATH_PURE		= (1 << 0),
	__OVL_PATH_UPPER	= (1 << 1),
	__OVL_PATH_MERGE	= (1 << 2),
};

/* Each test macro yields non-zero when the corresponding bit is set. */
#define OVL_TYPE_UPPER(type)		((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type)		((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type)	((type) & __OVL_PATH_PURE)

/*
 * True (1) when MERGE is set or UPPER is clear, false (0) otherwise.
 * Written as a single masked comparison so that @type is evaluated exactly
 * once; callers pass function calls such as ovl_path_type(dentry) here.
 */
#define OVL_TYPE_MERGE_OR_LOWER(type) \
	(((type) & (__OVL_PATH_MERGE | __OVL_PATH_UPPER)) != __OVL_PATH_UPPER)
/*
 * Overlay-private extended attributes share one prefix.
 * OVL_XATTR_PRE_LEN is the length of that prefix in characters, without
 * the terminating NUL.
 */
#define OVL_XATTR_PRE_NAME	"trusted.overlay."
#define OVL_XATTR_PRE_LEN	16
#define OVL_XATTR_OPAQUE	OVL_XATTR_PRE_NAME "opaque"
/* Option bits as returned by ovl_get_config_opt(). */
enum ovl_opt_bit {
	__OVL_OPT_DEFAULT	= 0x0,
	__OVL_OPT_NOCOPYUPW	= 0x1,
	__OVL_OPT_NOFSCHECK	= 0x2,
};

/* Each test macro yields non-zero when the corresponding bit is set. */
#define OVL_OPT_NOCOPYUPW(opt)	((opt) & __OVL_OPT_NOCOPYUPW)
#define OVL_OPT_NOFSCHECK(opt)	((opt) & __OVL_OPT_NOFSCHECK)
/*
 * List node pairing a dentry with an ovl_entry.
 * NOTE(review): presumably the record that ovl_add_d_fsdata() creates and
 * ovl_find_d_fsdata() looks up; both are defined elsewhere - confirm.
 */
struct ovl_d_fsdata {
/* linkage into the list of records */
struct list_head list;
/* dentry this record belongs to */
struct dentry *d;
/* ovl_entry associated with @d */
struct ovl_entry *oe;
};
/* vfs_rmdir() wrapper that logs the result at debug level. */
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
	int rc;

	rc = vfs_rmdir(dir, dentry);
	pr_debug("rmdir(%pd2) = %i\n", dentry, rc);

	return rc;
}
/* vfs_unlink() wrapper that logs the result at debug level. */
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
	int rc;

	rc = vfs_unlink(dir, dentry, NULL);
	pr_debug("unlink(%pd2) = %i\n", dentry, rc);

	return rc;
}
/* vfs_link() wrapper; the result is logged only when @debug is true. */
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
			      struct dentry *new_dentry, bool debug)
{
	int rc;

	rc = vfs_link(old_dentry, dir, new_dentry, NULL);
	if (debug)
		pr_debug("link(%pd2, %pd2) = %i\n",
			 old_dentry, new_dentry, rc);

	return rc;
}
/*
 * vfs_create() wrapper (always passes true as the last argument); the
 * result is logged only when @debug is true.
 */
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
				umode_t mode, bool debug)
{
	int rc;

	rc = vfs_create(dir, dentry, mode, true);
	if (debug) {
		pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, rc);
	}

	return rc;
}
/* vfs_mkdir() wrapper; the result is logged only when @debug is true. */
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
			       umode_t mode, bool debug)
{
	int rc;

	rc = vfs_mkdir(dir, dentry, mode);
	if (debug) {
		pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, rc);
	}

	return rc;
}
/* vfs_mknod() wrapper; the result is logged only when @debug is true. */
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
			       umode_t mode, dev_t dev, bool debug)
{
	int rc;

	rc = vfs_mknod(dir, dentry, mode, dev);
	if (debug)
		pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
			 dentry, mode, dev, rc);

	return rc;
}
/* vfs_symlink() wrapper; the result is logged only when @debug is true. */
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
				 const char *oldname, bool debug)
{
	int rc;

	rc = vfs_symlink(dir, dentry, oldname);
	if (debug) {
		pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, rc);
	}

	return rc;
}
/*
 * vfs_setxattr() wrapper that logs the call and its result at debug level.
 *
 * @value is a @size-byte buffer with no guaranteed NUL terminator, so the
 * format uses a precision ("%.*s") to bound the read to @size bytes.  The
 * former "%*s" only set a minimum field width and let the print run past
 * the end of the buffer.
 */
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
				  const void *value, size_t size, int flags)
{
	int err = vfs_setxattr(dentry, name, value, size, flags);

	pr_debug("setxattr(%pd2, \"%s\", \"%.*s\", 0x%x) = %i\n",
		 dentry, name, (int) size, (char *) value, flags, err);
	return err;
}
/* vfs_removexattr() wrapper that logs the result at debug level. */
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
	int rc;

	rc = vfs_removexattr(dentry, name);
	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, rc);

	return rc;
}
/*
 * vfs_rename() wrapper.  The request is logged before the call; the result
 * is logged only when the rename fails.
 */
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
				struct inode *newdir, struct dentry *newdentry,
				unsigned int flags)
{
	int rc;

	pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
		 olddentry, newdentry, flags);

	rc = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
	if (!rc)
		return 0;

	pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
		 olddentry, newdentry, rc);
	return rc;
}
/* vfs_whiteout() wrapper that logs the result at debug level. */
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
	int rc;

	rc = vfs_whiteout(dir, dentry);
	pr_debug("whiteout(%pd2) = %i\n", dentry, rc);

	return rc;
}
unsigned ovl_get_config_opt(struct dentry *dentry);
void ovl_reset_ovl_entry(struct ovl_entry **oe, struct dentry *dentry);
struct ovl_entry *ovl_find_d_fsdata(struct dentry *dentry);
int ovl_add_d_fsdata(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode,
bool is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
bool ovl_is_default_permissions(struct inode *inode);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
struct kstat *stat, const char *link);
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
int ovl_check_d_type_supported(struct path *realpath);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe);
/* Copy mode, owner and group from inode @from to inode @to. */
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
	to->i_mode = from->i_mode;
	to->i_uid = from->i_uid;
	to->i_gid = from->i_gid;
}
/* dir.c */
extern const struct inode_operations ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat);
int ovl_copy_xattr(struct dentry *old, struct dentry *new, unsigned opt);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);

View File

@@ -0,0 +1,616 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
/*
 * One directory entry held in the readdir cache.  Allocated by
 * ovl_cache_entry_new() with room for the name in the trailing array.
 */
struct ovl_cache_entry {
/* length of @name, not counting the terminating NUL */
unsigned int len;
/* d_type value reported for the entry */
unsigned int type;
/* inode number reported for the entry */
u64 ino;
/* linkage into an entry list (rdd->list or rdd->middle) */
struct list_head l_node;
/* linkage into the rb-tree rooted at rdd->root */
struct rb_node node;
/* next entry in the chain headed by rdd->first_maybe_whiteout (only set for DT_CHR entries) */
struct ovl_cache_entry *next_maybe_whiteout;
/* initialised to false when the entry is created */
bool is_whiteout;
/* NUL-terminated copy of the entry name */
char name[];
};
/* Reference-counted list of cached directory entries. */
struct ovl_dir_cache {
/* number of users; ovl_cache_put() frees the cache when this reaches zero */
long refcount;
/* NOTE(review): presumably compared against ovl_dentry_version_get() to detect staleness - confirm */
u64 version;
/* list of struct ovl_cache_entry, linked through l_node */
struct list_head entries;
};
/*
 * State for one directory-iteration pass.  The embedded @ctx is what the
 * fill callback receives; ovl_fill_merge() recovers this structure from it
 * with container_of().
 */
struct ovl_readdir_data {
/* embedded iteration context */
struct dir_context ctx;
/* when true, ovl_fill_merge() routes entries to ovl_fill_lowest() */
bool is_lowest;
/* rb-tree of entries, used for lookup by name */
struct rb_root root;
/* list that newly inserted tree entries are appended to */
struct list_head *list;
/* list that ovl_fill_lowest() moves or appends entries to */
struct list_head middle;
/* head of the chain of DT_CHR entries, linked via next_maybe_whiteout */
struct ovl_cache_entry *first_maybe_whiteout;
/* number of entries seen; incremented once per ovl_fill_merge() call */
int count;
/* sticky error; set to -ENOMEM by ovl_fill_lowest() on allocation failure */
int err;
/* NOTE(review): not used in the visible code - meaning to be confirmed */
bool d_type_supported;
};
/*
 * Per-open-directory state.
 * NOTE(review): only @cache is used in the visible code (ovl_cache_put());
 * the notes on the other fields are inferred from their names - confirm.
 */
struct ovl_dir_file {
/* presumably: iterate the real directory directly instead of the merged cache */
bool is_real;
/* presumably: the real file is on the upper layer */
bool is_upper;
/* cache reference released through ovl_cache_put() */
struct ovl_dir_cache *cache;
/* presumably: current position within the cache's entry list */
struct list_head *cursor;
/* presumably: open file on the real directory */
struct file *realfile;
/* presumably: open file on the upper directory */
struct file *upperfile;
};
/* Map an rb-tree node back to the cache entry that embeds it. */
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
	struct ovl_cache_entry *entry;

	entry = container_of(n, struct ovl_cache_entry, node);
	return entry;
}
/*
 * Look up @name (of length @len) in the rb-tree at @root.
 * Entries are ordered by strncmp() over @len bytes, with the entry length
 * as tie-breaker.  Returns the matching entry or NULL.
 */
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
						    const char *name, int len)
{
	struct rb_node *cur = root->rb_node;

	while (cur) {
		struct ovl_cache_entry *entry = ovl_cache_entry_from_node(cur);
		int cmp = strncmp(name, entry->name, len);

		if (cmp > 0) {
			cur = entry->node.rb_right;
			continue;
		}
		if (cmp == 0 && len >= entry->len)
			return entry;

		cur = entry->node.rb_left;
	}

	return NULL;
}
/*
 * Allocate a cache entry holding a NUL-terminated copy of @name.
 * An entry whose @d_type is DT_CHR is also pushed onto the front of
 * rdd's maybe-whiteout chain.  The list and tree linkage is left for the
 * caller to set up.  Returns NULL on allocation failure.
 */
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
						   const char *name, int len,
						   u64 ino, unsigned int d_type)
{
	size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
	struct ovl_cache_entry *entry = kmalloc(size, GFP_KERNEL);

	if (!entry)
		return NULL;

	memcpy(entry->name, name, len);
	entry->name[len] = '\0';

	entry->len = len;
	entry->type = d_type;
	entry->ino = ino;
	entry->is_whiteout = false;

	if (d_type == DT_CHR) {
		entry->next_maybe_whiteout = rdd->first_maybe_whiteout;
		rdd->first_maybe_whiteout = entry;
	}

	return entry;
}
/*
 * Insert a new entry for @name into rdd's rb-tree and append it to
 * rdd->list, unless an entry with that name is already in the tree.
 * Returns 0 (including for duplicates) or -ENOMEM.
 */
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
				  const char *name, int len, u64 ino,
				  unsigned int d_type)
{
	struct rb_node **link = &rdd->root.rb_node;
	struct rb_node *parent = NULL;
	struct ovl_cache_entry *entry;

	/* Walk down to the insertion point, bailing out on a duplicate. */
	while (*link) {
		struct ovl_cache_entry *cur;
		int cmp;

		parent = *link;
		cur = ovl_cache_entry_from_node(*link);
		cmp = strncmp(name, cur->name, len);

		if (cmp > 0) {
			link = &cur->node.rb_right;
			continue;
		}
		if (cmp == 0 && len >= cur->len)
			return 0;

		link = &cur->node.rb_left;
	}

	entry = ovl_cache_entry_new(rdd, name, len, ino, d_type);
	if (!entry)
		return -ENOMEM;

	list_add_tail(&entry->l_node, rdd->list);
	rb_link_node(&entry->node, parent, link);
	rb_insert_color(&entry->node, &rdd->root);

	return 0;
}
/*
 * Actor helper used while the last layer is being read.
 *
 * A name already collected from an earlier layer is moved in front of
 * rdd->middle; an unseen name gets a new entry placed there.  New entries
 * are not inserted into the rb-tree.
 *
 * Returns rdd->err, which is set to -ENOMEM if allocation failed.
 */
static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
			   const char *name, int namelen,
			   loff_t offset, u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *entry;

	entry = ovl_cache_entry_find(&rdd->root, name, namelen);
	if (entry) {
		list_move_tail(&entry->l_node, &rdd->middle);
		return rdd->err;
	}

	entry = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
	if (entry)
		list_add_tail(&entry->l_node, &rdd->middle);
	else
		rdd->err = -ENOMEM;

	return rdd->err;
}
/*
 * Free every cache entry linked on @list and leave @list as an empty,
 * reusable list head.
 */
void ovl_cache_free(struct list_head *list)
{
	struct ovl_cache_entry *entry, *next;

	/* _safe variant: the entry is freed while the list is being walked */
	list_for_each_entry_safe(entry, next, list, l_node)
		kfree(entry);

	INIT_LIST_HEAD(list);
}
/*
 * Drop one reference on od->cache.  When the last reference goes away the
 * cache is detached from @dentry (if it is still the one attached there)
 * and freed together with all of its entries.
 */
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
	struct ovl_dir_cache *cache = od->cache;

	WARN_ON(cache->refcount <= 0);

	if (--cache->refcount)
		return;		/* still in use by someone else */

	if (ovl_dir_cache(dentry) == cache)
		ovl_set_dir_cache(dentry, NULL);

	ovl_cache_free(&cache->entries);
	kfree(cache);
}
/*
 * readdir actor used when building a merged listing.  Counts the name and
 * hands it to the helper matching the layer currently being read.
 */
static int ovl_fill_merge(struct dir_context *ctx, const char *name,
			  int namelen, loff_t offset, u64 ino,
			  unsigned int d_type)
{
	struct ovl_readdir_data *rdd;

	rdd = container_of(ctx, struct ovl_readdir_data, ctx);
	rdd->count++;

	if (rdd->is_lowest)
		return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type);

	return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
}
/*
 * Resolve every entry queued on rdd->first_maybe_whiteout by looking it up
 * in @dir and recording whether it is a whiteout.  Entries whose lookup
 * fails keep is_whiteout == false.  The chain is empty afterwards unless
 * taking the directory lock was interrupted.
 *
 * Returns 0 on success, -ENOMEM if credentials could not be prepared, or
 * the error from mutex_lock_killable().
 */
static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
{
	struct ovl_cache_entry *p;
	struct dentry *dentry;
	const struct cred *old_cred;
	struct cred *override_cred;
	int err;

	override_cred = prepare_creds();
	if (!override_cred)
		return -ENOMEM;

	/*
	 * CAP_DAC_OVERRIDE for lookup
	 */
	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
	old_cred = override_creds(override_cred);

	err = mutex_lock_killable(&dir->d_inode->i_mutex);
	if (err)
		goto out_revert;

	/* Pop entries off the chain one at a time. */
	for (p = rdd->first_maybe_whiteout; p; p = rdd->first_maybe_whiteout) {
		rdd->first_maybe_whiteout = p->next_maybe_whiteout;

		dentry = lookup_one_len(p->name, dir, p->len);
		if (IS_ERR(dentry))
			continue;

		p->is_whiteout = ovl_is_whiteout(dentry);
		dput(dentry);
	}
	inode_unlock(dir->d_inode);

out_revert:
	revert_creds(old_cred);
	put_cred(override_cred);

	return err;
}
/*
 * Read the whole directory at @realpath through rdd->ctx.actor.
 *
 * iterate_dir() is called repeatedly from position 0 until a pass delivers
 * no names or an error occurs.  If any DT_CHR entries were queued they are
 * then checked for being whiteouts.
 *
 * Returns 0 on success or a negative error (from opening the directory,
 * from iterate_dir(), from the actor via rdd->err, or from
 * ovl_check_whiteouts()).
 */
static inline int ovl_dir_read(struct path *realpath,
			       struct ovl_readdir_data *rdd)
{
	struct file *realfile;
	int err;

	realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
	if (IS_ERR(realfile))
		return PTR_ERR(realfile);

	rdd->first_maybe_whiteout = NULL;
	rdd->ctx.pos = 0;

	for (;;) {
		rdd->count = 0;
		rdd->err = 0;

		err = iterate_dir(realfile, &rdd->ctx);
		if (err >= 0)
			err = rdd->err;

		/* stop on error or once a pass produced nothing */
		if (err || !rdd->count)
			break;
	}

	if (!err && rdd->first_maybe_whiteout)
		err = ovl_check_whiteouts(realpath->dentry, rdd);

	fput(realfile);

	return err;
}
static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
struct ovl_dir_cache *cache = od->cache;
struct dentry *dentry = file->f_path.dentry;
enum ovl_path_type type = ovl_path_type(dentry);
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
ovl_cache_put(od, dentry);
od->cache = NULL;
od->cursor = NULL;
}
WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
if (od->is_real && OVL_TYPE_MERGE(type))
od->is_real = false;
}
/*
 * Build the merged listing of @dentry on @list by reading every layer
 * returned by ovl_path_next(), in the order it returns them.
 *
 * Layers before the last are added through ovl_cache_entry_add_rb(), which
 * appends to @list and filters names already seen.  The last layer is read
 * with rdd.is_lowest set, so its names go in front of the entries already
 * collected (see ovl_fill_lowest()).
 *
 * Returns 0 on success or the first error from ovl_dir_read().  On error
 * @list may hold a partial listing; the caller is expected to free it.
 */
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
int err;
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
.list = list,
.root = RB_ROOT,
.is_lowest = false,
};
int idx, next;
/* ovl_path_next() returns -1 once realpath refers to the last layer */
for (idx = 0; idx != -1; idx = next) {
next = ovl_path_next(idx, dentry, &realpath);
if (next != -1) {
err = ovl_dir_read(&realpath, &rdd);
if (err)
break;
} else {
/*
* Insert lowest layer entries before upper ones, this
* allows offsets to be reasonably constant
*/
/* rdd.middle sits at the head of the list only for this read */
list_add(&rdd.middle, rdd.list);
rdd.is_lowest = true;
err = ovl_dir_read(&realpath, &rdd);
list_del(&rdd.middle);
}
}
return err;
}
/*
 * Point od->cursor at the @pos-th entry of the cached listing, or at the
 * list head itself when @pos is at or beyond the end.
 */
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
	struct list_head *head = &od->cache->entries;
	struct list_head *node = head->next;
	loff_t idx;

	for (idx = 0; node != head && idx < pos; idx++)
		node = node->next;

	/* Cursor is safe since the cache is stable */
	od->cursor = node;
}
/*
 * Get a referenced merged-listing cache for @dentry.
 *
 * The cache attached to the dentry is reused when its version is current;
 * otherwise it is detached and a fresh listing is built and attached.
 *
 * Returns the cache, or an ERR_PTR() carrying -ENOMEM or the error from
 * ovl_dir_read_merged().
 */
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
	struct ovl_dir_cache *cache = ovl_dir_cache(dentry);
	int err;

	if (cache && ovl_dentry_version_get(dentry) == cache->version) {
		cache->refcount++;
		return cache;
	}

	ovl_set_dir_cache(dentry, NULL);

	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
	if (!cache)
		return ERR_PTR(-ENOMEM);

	cache->refcount = 1;
	INIT_LIST_HEAD(&cache->entries);

	err = ovl_dir_read_merged(dentry, &cache->entries);
	if (err)
		goto out_free;

	cache->version = ovl_dentry_version_get(dentry);
	ovl_set_dir_cache(dentry, cache);

	return cache;

out_free:
	/* discard whatever part of the listing was collected */
	ovl_cache_free(&cache->entries);
	kfree(cache);
	return ERR_PTR(err);
}
/*
 * ->iterate for overlay directories.
 *
 * Non-merged directories are forwarded to the underlying file.  Merged
 * directories are served from the cached listing: ctx->pos counts cache
 * entries, and whiteouts advance the position without being emitted.
 *
 * Returns 0, the result of iterate_dir() for non-merged directories, or
 * the error from ovl_cache_get().
 */
static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;

	if (!ctx->pos)
		ovl_dir_reset(file);

	if (od->is_real)
		return iterate_dir(od->realfile, ctx);

	if (!od->cache) {
		struct ovl_dir_cache *cache = ovl_cache_get(dentry);

		if (IS_ERR(cache))
			return PTR_ERR(cache);

		od->cache = cache;
		ovl_seek_cursor(od, ctx->pos);
	}

	for (; od->cursor != &od->cache->entries; ctx->pos++) {
		struct ovl_cache_entry *entry;

		entry = list_entry(od->cursor, struct ovl_cache_entry, l_node);

		/* a full user buffer stops here; cursor and pos stay put */
		if (!entry->is_whiteout &&
		    !dir_emit(ctx, entry->name, entry->len, entry->ino,
			      entry->type))
			break;

		od->cursor = entry->l_node.next;
	}

	return 0;
}
/*
 * ->llseek for overlay directories, serialized by the inode lock.
 *
 * Non-merged directories seek the underlying file and mirror its f_pos.
 * Merged directories accept only SEEK_SET and SEEK_CUR with a non-negative
 * resulting offset; the cursor into the cached listing is repositioned
 * when a cache is held.
 *
 * Returns the new offset, the result of vfs_llseek(), or -EINVAL.
 */
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
	struct ovl_dir_file *od = file->private_data;
	loff_t res;

	inode_lock(file_inode(file));

	if (!file->f_pos)
		ovl_dir_reset(file);

	if (od->is_real) {
		res = vfs_llseek(od->realfile, offset, origin);
		file->f_pos = od->realfile->f_pos;
		goto out_unlock;
	}

	res = -EINVAL;

	if (origin == SEEK_CUR)
		offset += file->f_pos;
	else if (origin != SEEK_SET)
		goto out_unlock;

	if (offset < 0)
		goto out_unlock;

	if (offset != file->f_pos) {
		file->f_pos = offset;
		if (od->cache)
			ovl_seek_cursor(od, offset);
	}
	res = offset;

out_unlock:
	inode_unlock(file_inode(file));

	return res;
}
/*
 * ->fsync for overlay directories.
 *
 * Syncs od->realfile, except when the file was not opened on an upper
 * directory but the dentry has an upper component by now: then the upper
 * directory is opened once, remembered in od->upperfile, and synced
 * instead.
 *
 * Returns the result of vfs_fsync_range(), or the error from opening the
 * upper directory when no other caller managed to install one.
 */
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct file *realfile = od->realfile;
/*
* Need to check if we started out being a lower dir, but got copied up
*/
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
/* unlocked fast path: reuse an upper file installed earlier */
realfile = lockless_dereference(od->upperfile);
if (!realfile) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
/* opened before taking the lock; may turn out to be redundant */
realfile = ovl_path_open(&upperpath, O_RDONLY);
smp_mb__before_spinlock();
inode_lock(inode);
/* re-check under the lock: another caller may have installed one */
if (!od->upperfile) {
if (IS_ERR(realfile)) {
inode_unlock(inode);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
} else {
/* somebody has beaten us to it */
if (!IS_ERR(realfile))
fput(realfile);
realfile = od->upperfile;
}
inode_unlock(inode);
}
}
return vfs_fsync_range(realfile, start, end, datasync);
}
/*
 * ->release for overlay directories: drop the cache reference (under the
 * inode lock), close the underlying file(s) and free the per-file state.
 * Always returns 0.
 */
static int ovl_dir_release(struct inode *inode, struct file *file)
{
	struct ovl_dir_file *dir_file = file->private_data;

	if (dir_file->cache) {
		inode_lock(inode);
		ovl_cache_put(dir_file, file->f_path.dentry);
		inode_unlock(inode);
	}

	fput(dir_file->realfile);

	/* only set if ovl_dir_fsync() had to open the upper directory */
	if (dir_file->upperfile)
		fput(dir_file->upperfile);

	kfree(dir_file);

	return 0;
}
/*
 * ->open for overlay directories: open the real directory behind the
 * dentry and record whether it is merged and whether it is an upper one.
 *
 * Returns 0 on success, -ENOMEM, or the error from ovl_path_open().
 */
static int ovl_dir_open(struct inode *inode, struct file *file)
{
	struct ovl_dir_file *od;
	struct file *realfile;
	struct path realpath;
	enum ovl_path_type type;

	od = kzalloc(sizeof(*od), GFP_KERNEL);
	if (!od)
		return -ENOMEM;

	type = ovl_path_real(file->f_path.dentry, &realpath);
	realfile = ovl_path_open(&realpath, file->f_flags);
	if (!IS_ERR(realfile)) {
		od->realfile = realfile;
		od->is_real = !OVL_TYPE_MERGE(type);
		od->is_upper = OVL_TYPE_UPPER(type);
		file->private_data = od;
		return 0;
	}

	kfree(od);
	return PTR_ERR(realfile);
}
/* File operations installed for overlay directories. */
const struct file_operations ovl_dir_operations = {
.read = generic_read_dir,
.open = ovl_dir_open,
.iterate = ovl_iterate,
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
};
/*
 * Read the merged listing of @dentry into @list and check that it holds
 * nothing besides whiteouts, "." and "..".
 *
 * Returns 0 if the directory is empty in that sense, -ENOTEMPTY if not,
 * or the error from ovl_dir_read_merged().  @list keeps the collected
 * entries in every case.
 */
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
	struct ovl_cache_entry *p;
	int err;

	err = ovl_dir_read_merged(dentry, list);
	if (err)
		return err;

	list_for_each_entry(p, list, l_node) {
		bool is_dot, is_dotdot;

		if (p->is_whiteout)
			continue;

		is_dot = p->len == 1 && p->name[0] == '.';
		is_dotdot = p->len == 2 && p->name[0] == '.' &&
			    p->name[1] == '.';
		if (is_dot || is_dotdot)
			continue;

		return -ENOTEMPTY;
	}

	return 0;
}
/*
 * Remove from directory @upper every name on @list that is flagged as a
 * whiteout.  Lookup failures are logged and skipped; names that resolve to
 * a negative dentry are left alone.  Runs under the nested (I_MUTEX_CHILD)
 * inode lock of @upper.
 */
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
	struct ovl_cache_entry *p;

	inode_lock_nested(upper->d_inode, I_MUTEX_CHILD);

	list_for_each_entry(p, list, l_node) {
		struct dentry *dentry;

		if (!p->is_whiteout)
			continue;

		dentry = lookup_one_len(p->name, upper, p->len);
		if (!IS_ERR(dentry)) {
			if (dentry->d_inode)
				ovl_cleanup(upper->d_inode, dentry);
			dput(dentry);
			continue;
		}

		pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
		       upper->d_name.name, p->len, p->name,
		       (int) PTR_ERR(dentry));
	}

	inode_unlock(upper->d_inode);
}
/*
 * readdir actor for ovl_check_d_type_supported(): flags d_type support as
 * soon as any name other than "." or ".." carries a d_type different from
 * DT_UNKNOWN.  Always returns 0 so the whole directory is scanned.
 */
static int ovl_check_d_type(struct dir_context *ctx, const char *name,
			    int namelen, loff_t offset, u64 ino,
			    unsigned int d_type)
{
	struct ovl_readdir_data *rdd;

	rdd = container_of(ctx, struct ovl_readdir_data, ctx);

	/* Even if d_type is not supported, DT_DIR is returned for . and .. */
	if (strncmp(name, ".", namelen) == 0 ||
	    strncmp(name, "..", namelen) == 0)
		return 0;

	if (d_type == DT_UNKNOWN)
		return 0;

	rdd->d_type_supported = true;
	return 0;
}
/*
 * Scan the directory at @realpath to find out whether the filesystem
 * reports d_type in its directory entries.
 *
 * Returns 1 if d_type is supported, 0 if it is not supported or could not
 * be determined, and a negative value if reading the directory failed.
 */
int ovl_check_d_type_supported(struct path *realpath)
{
	struct ovl_readdir_data rdd = {
		.ctx.actor = ovl_check_d_type,
		.d_type_supported = false,
	};
	int err = ovl_dir_read(realpath, &rdd);

	return err ? err : rdd.d_type_supported;
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,27 +1,43 @@
CC=@CC@
BINDIR=@BINDIR@
prefix=@prefix@
exec_prefix=@exec_prefix@
LIBDIR=@libdir@
MCKERNEL_LIBDIR=@MCKERNEL_LIBDIR@
KDIR ?= @KDIR@
CFLAGS=-Wall -O -I.
CFLAGS=-Wall -O -I. -I$(VPATH)/arch/${ARCH}
VPATH=@abs_srcdir@
TARGET=mcexec
TARGET=mcexec libsched_yield
@uncomment_if_ENABLE_MEMDUMP@TARGET+=eclair
LIBS=@LIBS@
ARCH=@ARCH@
IHKDIR ?= $(VPATH)/../../../ihk/linux/include/
all: $(TARGET)
mcexec: mcexec.c
$(CC) -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -fPIE -pie -lrt -pthread -o $@ $^ $(EXTRA_OBJS)
mcexec: mcexec.c libmcexec.a
$(CC) -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -DLIBDIR=\"$(LIBDIR)\" -fPIE -pie -L. -lmcexec -lrt -lnuma -pthread -o $@ $^ $(EXTRA_OBJS)
eclair: eclair.c
$(CC) $(CFLAGS) -o $@ $^ $(LIBS)
$(CC) $(CFLAGS) -I${IHKDIR} -o $@ $^ $(LIBS)
clean:
libsched_yield: libsched_yield.c
$(CC) -shared -fPIC -Wl,-soname,sched_yield.so.1 -o libsched_yield.so.1.0.0 $^ -lc -ldl
libmcexec.a::
(cd arch/${ARCH}; make)
clean::
(cd arch/${ARCH}; make clean)
$(RM) $(TARGET) *.o
.PHONY: all clean install
install:
install::
(cd arch/${ARCH}; make install)
mkdir -p -m 755 $(BINDIR)
install -m 755 mcexec $(BINDIR)
mkdir -p -m 755 $(MCKERNEL_LIBDIR)
install -m 755 libsched_yield.so.1.0.0 $(MCKERNEL_LIBDIR)
@uncomment_if_ENABLE_MEMDUMP@install -m 755 eclair $(BINDIR)

View File

@@ -0,0 +1,23 @@
CC=@CC@
AR=ar
BINDIR=@BINDIR@
KDIR ?= @KDIR@
CFLAGS=-Wall -O -I.
VPATH=@abs_srcdir@
TARGET=../../libmcexec.a
LIBS=@LIBS@
all: $(TARGET)
../../libmcexec.a: archdep.o
$(AR) cr ../../libmcexec.a archdep.o
archdep.o: archdep.S
$(CC) -c -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -fPIE -pie -pthread $<
clean:
$(RM) $(TARGET) *.o
.PHONY: all clean install
install:

View File

@@ -0,0 +1,113 @@
/*
 * Accessors for the system call state of a traced process.
 *
 * syscall_args is the register block transferred by PTRACE_GETREGS /
 * PTRACE_SETREGS.  The getters and setters below map the system call
 * number, return value and the six arguments onto its fields:
 *   number: orig_rax   return: rax
 *   arg1..arg6: rdi, rsi, rdx, r10, r8, r9
 *
 * This header does not include the declarations of ptrace() or
 * struct user_regs_struct; the including file must provide them.
 */
#ifndef ARCH_ARGS_H
#define ARCH_ARGS_H
typedef struct user_regs_struct syscall_args;
/* Fetch the registers of process @pid into @args; returns ptrace()'s result. */
static inline int
get_syscall_args(int pid, syscall_args *args)
{
return ptrace(PTRACE_GETREGS, pid, NULL, args);
}
/* Write @args back as the registers of process @pid; returns ptrace()'s result. */
static inline int
set_syscall_args(int pid, syscall_args *args)
{
return ptrace(PTRACE_SETREGS, pid, NULL, args);
}
/* Getters: read one field of a previously fetched register block. */
static inline unsigned long
get_syscall_number(syscall_args *args)
{
return args->orig_rax;
}
static inline unsigned long
get_syscall_return(syscall_args *args)
{
return args->rax;
}
static inline unsigned long
get_syscall_arg1(syscall_args *args)
{
return args->rdi;
}
static inline unsigned long
get_syscall_arg2(syscall_args *args)
{
return args->rsi;
}
static inline unsigned long
get_syscall_arg3(syscall_args *args)
{
return args->rdx;
}
static inline unsigned long
get_syscall_arg4(syscall_args *args)
{
return args->r10;
}
static inline unsigned long
get_syscall_arg5(syscall_args *args)
{
return args->r8;
}
static inline unsigned long
get_syscall_arg6(syscall_args *args)
{
return args->r9;
}
/*
 * Setters: modify the local register block only.  The traced process is
 * not affected until the block is written back with set_syscall_args().
 */
static inline void
set_syscall_number(syscall_args *args, unsigned long value)
{
args->orig_rax = value;
}
static inline void
set_syscall_return(syscall_args *args, unsigned long value)
{
args->rax = value;
}
static inline void
set_syscall_arg1(syscall_args *args, unsigned long value)
{
args->rdi = value;
}
static inline void
set_syscall_arg2(syscall_args *args, unsigned long value)
{
args->rsi = value;
}
static inline void
set_syscall_arg3(syscall_args *args, unsigned long value)
{
args->rdx = value;
}
static inline void
set_syscall_arg4(syscall_args *args, unsigned long value)
{
args->r10 = value;
}
static inline void
set_syscall_arg5(syscall_args *args, unsigned long value)
{
args->r8 = value;
}
static inline void
set_syscall_arg6(syscall_args *args, unsigned long value)
{
args->r9 = value;
}
#endif

View File

@@ -0,0 +1,149 @@
/*
arg: rdi, rsi, rdx, rcx, r8, r9
ret: rax
rax syscall number
syscall: (rax:num) rdi rsi rdx r10 r8 r9 (rcx:ret addr)
fd, cmd, param
rdi: fd
rsi: cmd
rdx: param
rcx: save area
r8: new thread context

switch_ctx saves the caller's registers into the save area, issues
ioctl(fd, cmd, param) with the save area and the new context passed as
4th and 5th syscall arguments, and then either
- loads the register set stored in the new thread context and jumps
to it (ioctl returned 0), or
- restores the caller's own registers and returns the ioctl result
(non-zero result; values in the error range are returned as -1).

Context layout, as used by the stores and loads below:
0x00 written as 0 on save 0x08 rax 0x10 rbx 0x18 rcx
0x20 rdx 0x28 rsi 0x30 rdi 0x38 rbp
0x40 r8 0x48 r9 0x50 r10 0x58 r11
0x60 r12 0x68 r13 0x70 r14 0x78 r15
0x80 rflags 0x88 return address 0x90 rsp
0x98 FS base (only read here, passed to arch_prctl(ARCH_SET_FS))
*/
.global switch_ctx
switch_ctx:
/* save the caller's general purpose registers */
movq $0,0x00(%rcx)
movq %rax,0x8(%rcx)
movq %rbx,0x10(%rcx)
movq %rcx,0x18(%rcx)
movq %rdx,0x20(%rcx)
movq %rsi,0x28(%rcx)
movq %rdi,0x30(%rcx)
movq %rbp,0x38(%rcx)
movq %r8,0x40(%rcx)
movq %r9,0x48(%rcx)
movq %r10,0x50(%rcx)
movq %r11,0x58(%rcx)
movq %r12,0x60(%rcx)
movq %r13,0x68(%rcx)
movq %r14,0x70(%rcx)
movq %r15,0x78(%rcx)
/* rflags via the stack; rax is free to clobber, it was saved above */
pushfq
popq %rax
movq %rax,0x80(%rcx)
/* the word on top of the stack is this function's return address */
movq 0x00(%rsp),%rax
movq %rax,0x88(%rcx)
movq %rsp,0x90(%rcx)
/* syscall takes its 4th argument in r10 and clobbers rcx */
movq %rcx,%r10
pushq %rcx
pushq %r8
pushq %rax
mov $0x10,%eax /* ioctl */
syscall
3:
/* first pop discards the pushed rax, the second restores r8 */
popq %r8
popq %r8
popq %rcx
movq %r10,%rcx
/* NOTE(review): 64-bit immediates are used with the 32-bit %eax here and at label 1 - confirm the assembler handles them as intended */
cmp $0xfffffffffffff001,%eax
jae 1f
test %eax,%eax
jnz 2f
/* ioctl returned 0: load the context pointed to by r8 */
pushq %rax
movq $158,%rax /* arch_prctl */
movq $0x1002,%rdi /* ARCH_SET_FS */
movq 0x98(%r8),%rsi
syscall
popq %rax
movq 0x10(%r8),%rbx
movq 0x18(%r8),%rcx
movq 0x20(%r8),%rdx
movq 0x28(%r8),%rsi
movq 0x30(%r8),%rdi
movq 0x38(%r8),%rbp
movq 0x48(%r8),%r9
movq 0x50(%r8),%r10
movq 0x58(%r8),%r11
movq 0x60(%r8),%r12
movq 0x68(%r8),%r13
movq 0x70(%r8),%r14
movq 0x78(%r8),%r15
movq 0x80(%r8),%rax
pushq %rax
popfq
movq 0x90(%r8),%rsp
// movq 0x8(%r8),%rax /* for interrupts */
/* r8 is the base pointer for the loads above, so it is reloaded last */
movq 0x40(%r8),%r8
movq $0,%rax /* ioctl return */
/* push + ret transfers control to the address loaded from 0x18 of the new context */
pushq %rcx
retq
1:
/* ioctl result was in the error range: report -1 */
mov $0xffffffffffffffff,%eax
2:
/* restore the caller's own context from the save area (rcx) and return; rax carries the result */
pushq %rax
movq $158,%rax /* arch_prctl */
movq $0x1002,%rdi /* ARCH_SET_FS */
movq 0x98(%rcx),%rsi
syscall
popq %rax
movq 0x10(%rcx),%rbx
movq 0x28(%rcx),%rsi
movq 0x30(%rcx),%rdi
movq 0x38(%rcx),%rbp
movq 0x40(%rcx),%r8
movq 0x48(%rcx),%r9
movq 0x50(%rcx),%r10
movq 0x58(%rcx),%r11
movq 0x60(%rcx),%r12
movq 0x68(%rcx),%r13
movq 0x70(%rcx),%r14
movq 0x78(%rcx),%r15
/* rdx is used as scratch for rflags before its own value is reloaded */
movq 0x80(%rcx),%rdx
pushq %rdx
popfq
movq 0x20(%rcx),%rdx
/* rcx is the base pointer, so it is reloaded last */
movq 0x18(%rcx),%rcx
retq
/*
arg: rdi, rsi, rdx, rcx, r8, r9
ret: rax
unsigned long
compare_and_swap(unsigned long *addr, unsigned long old, unsigned long new);
rdi: addr
rsi: old
rdx: new
RET: old value

Atomically stores new at *addr if *addr equals old.  The value found at
*addr is returned in either case, so the swap happened exactly when the
return value equals old.
*/
.global compare_and_swap
compare_and_swap:
/* cmpxchg compares against rax, so the expected value goes there */
movq %rsi,%rax
lock
cmpxchgq %rdx,0(%rdi)
retq
/*
unsigned int
compare_and_swap_int(unsigned int *addr, unsigned int old, unsigned int new);
ret: old value

32-bit variant of compare_and_swap: atomically stores new at *addr if
*addr equals old and returns the value found at *addr.
*/
.global compare_and_swap_int
compare_and_swap_int:
/* cmpxchg compares against eax, so the expected value goes there */
movl %esi,%eax
lock
cmpxchgl %edx,0(%rdi)
retq

3
executer/user/archdep.h Normal file
View File

@@ -0,0 +1,3 @@
/*
 * Architecture dependent helpers implemented in assembly (archdep.S).
 */
/*
 * Save the current register context in lctx and issue ioctl(fd, cmd, param);
 * on a zero result execution continues in the context stored in rctx,
 * otherwise the ioctl result is returned to the caller.
 */
extern int switch_ctx(int fd, unsigned long cmd, void **param, void *lctx, void *rctx);
/* Atomic compare-and-swap on *addr; returns the value found at *addr. */
extern unsigned long compare_and_swap(unsigned long *addr, unsigned long old, unsigned long new);
/* 32-bit variant of compare_and_swap(). */
extern unsigned int compare_and_swap_int(unsigned int *addr, unsigned int old, unsigned int new);

View File

@@ -16,6 +16,8 @@
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <sys/ioctl.h>
#include <ihk/ihk_host_user.h>
#define CPU_TID_BASE 1000000
@@ -25,6 +27,10 @@ struct options {
char *kernel_path;
char *dump_path;
char *log_path;
int interactive;
int os_id;
int mcos_fd;
int print_idle;
}; /* struct options */
struct thread_info {
@@ -42,7 +48,7 @@ struct thread_info {
int tid;
int cpu;
int lcpu;
int padding;
int idle;
uintptr_t process;
uintptr_t clv;
uintptr_t x86_clv;
@@ -53,6 +59,7 @@ static volatile int f_done = 0;
static bfd *symbfd = NULL;
static bfd *dumpbfd = NULL;
static asection *dumpscn = NULL;
static dump_mem_chunks_t *mem_chunks;
static int num_processors = -1;
static asymbol **symtab = NULL;
static ssize_t nsyms;
@@ -91,25 +98,35 @@ static uintptr_t virt_to_phys(uintptr_t va) {
static int read_physmem(uintptr_t pa, void *buf, size_t size) {
off_t off;
bfd_boolean ok;
int i;
if (pa < dumpscn->vma) {
printf("read_physmem(%lx,%p,%lx):too small pa. vma %lx\n", pa, buf, size, dumpscn->vma);
return 1;
}
off = pa - dumpscn->vma;
if (off >= dumpscn->size) {
printf("read_physmem(%lx,%p,%lx):too large pa. vma %lx size %lx\n", pa, buf, size, dumpscn->vma, dumpscn->size);
return 1;
}
if ((dumpscn->size - off) < size) {
printf("read_physmem(%lx,%p,%lx):too large size. vma %lx size %lx\n", pa, buf, size, dumpscn->vma, dumpscn->size);
off = 0;
/* Check if pa is valid in any chunks and figure
* out the global offset in dump section */
for (i = 0; i < mem_chunks->nr_chunks; ++i) {
if (mem_chunks->chunks[i].addr <= pa &&
((pa + size) <= (mem_chunks->chunks[i].addr +
mem_chunks->chunks[i].size))) {
off += (pa - mem_chunks->chunks[i].addr);
break;
}
off += mem_chunks->chunks[i].size;
}
if (i == mem_chunks->nr_chunks) {
printf("read_physmem: invalid addr 0x%lx\n", pa);
return 1;
}
ok = bfd_get_section_contents(dumpbfd, dumpscn, buf, off, size);
if (!ok) {
bfd_perror("read_physmem:bfd_get_section_contents");
return 1;
}
return 0;
} /* read_physmem() */
@@ -125,7 +142,21 @@ static int read_mem(uintptr_t va, void *buf, size_t size) {
}
return 1;
}
error = read_physmem(pa, buf, size);
if (opt.interactive) {
dumpargs_t args;
args.cmd = DUMP_READ;
args.start = pa;
args.size = size;
args.buf = buf;
error = ioctl(opt.mcos_fd, IHK_OS_DUMP, &args);
}
else {
error = read_physmem(pa, buf, size);
}
if (error) {
perror("read_mem:read_physmem");
return 1;
@@ -231,6 +262,7 @@ static int setup_threads(void) {
perror("num_processors");
return 1;
}
printf("%s: num_processors: %d\n", __FUNCTION__, num_processors);
error = read_symbol_64("locals", &locals);
if (error) {
@@ -253,64 +285,6 @@ static int setup_threads(void) {
ihk_mc_switch_context = lookup_symbol("ihk_mc_switch_context");
if (0) printf("ihk_mc_switch_context: %lx\n", ihk_mc_switch_context);
/* Set up idle threads first */
for (cpu = 0; cpu < num_processors; ++cpu) {
uintptr_t v;
uintptr_t thread;
uintptr_t proc;
int pid;
int tid;
struct thread_info *ti;
int status;
v = clv + (cpu * K(CPU_LOCAL_VAR_SIZE));
ti = malloc(sizeof(*ti));
if (!ti) {
perror("malloc");
return 1;
}
thread = v+K(IDLE_THREAD_OFFSET);
error = read_64(thread+K(PROC_OFFSET), &proc);
if (error) {
perror("proc");
return 1;
}
error = read_32(thread+K(STATUS_OFFSET), &status);
if (error) {
perror("status");
return 1;
}
error = read_32(proc+K(PID_OFFSET), &pid);
if (error) {
perror("pid");
return 1;
}
error = read_32(thread+K(TID_OFFSET), &tid);
if (error) {
perror("tid");
return 1;
}
ti->next = NULL;
ti->status = status;
ti->pid = pid;
ti->tid = tid;
ti->cpu = cpu;
ti->lcpu = cpu;
ti->process = thread;
ti->clv = v;
ti->x86_clv = locals + locals_span*cpu;
*titailp = ti;
titailp = &ti->next;
}
for (cpu = 0; cpu < num_processors; ++cpu) {
uintptr_t v;
uintptr_t head;
@@ -375,15 +349,19 @@ static int setup_threads(void) {
ti->status = status;
ti->pid = pid;
ti->tid = tid;
ti->cpu = (thread == current)? cpu: -1;
ti->cpu = (thread == current) ? cpu : -1;
ti->lcpu = cpu;
ti->process = thread;
ti->idle = 0;
ti->clv = v;
ti->x86_clv = locals + locals_span*cpu;
*titailp = ti;
titailp = &ti->next;
if (!curr_thread)
curr_thread = ti;
error = read_64(entry, &entry);
if (error) {
perror("process2");
@@ -392,8 +370,78 @@ static int setup_threads(void) {
}
}
/* Set up idle threads */
if (opt.print_idle) {
for (cpu = 0; cpu < num_processors; ++cpu) {
uintptr_t v;
uintptr_t thread;
uintptr_t proc;
int pid;
int tid;
struct thread_info *ti;
int status;
v = clv + (cpu * K(CPU_LOCAL_VAR_SIZE));
error = read_64(v+K(CURRENT_OFFSET), &current);
if (error) {
perror("current");
return 1;
}
ti = malloc(sizeof(*ti));
if (!ti) {
perror("malloc");
return 1;
}
thread = v+K(IDLE_THREAD_OFFSET);
error = read_64(thread+K(PROC_OFFSET), &proc);
if (error) {
perror("proc");
return 1;
}
error = read_32(thread+K(STATUS_OFFSET), &status);
if (error) {
perror("status");
return 1;
}
error = read_32(proc+K(PID_OFFSET), &pid);
if (error) {
perror("pid");
return 1;
}
error = read_32(thread+K(TID_OFFSET), &tid);
if (error) {
perror("tid");
return 1;
}
ti->next = NULL;
ti->status = status;
ti->pid = 1;
ti->tid = 2000000000 + tid;
ti->cpu = (thread == current) ? cpu : -1;
ti->lcpu = cpu;
ti->process = thread;
ti->idle = 1;
ti->clv = v;
ti->x86_clv = locals + locals_span*cpu;
*titailp = ti;
titailp = &ti->next;
if (!curr_thread)
curr_thread = ti;
}
}
if (!tihead) {
printf("thread not found. cpu mode forcibly\n");
printf("No threads found, forcing CPU mode.\n");
opt.cpu = 1;
}
@@ -434,6 +482,7 @@ static int setup_threads(void) {
ti->tid = CPU_TID_BASE + cpu;
ti->cpu = cpu;
ti->process = current;
ti->idle = 1;
ti->clv = v;
ti->x86_clv = locals + locals_span*cpu;
@@ -446,7 +495,9 @@ static int setup_threads(void) {
printf("thread not found\n");
return 1;
}
curr_thread = tihead;
if (!curr_thread)
curr_thread = tihead;
return 0;
} /* setup_threads() */
@@ -508,13 +559,32 @@ static int setup_dump(char *fname) {
return 1;
}
dumpscn = bfd_get_section_by_name(dumpbfd, "physmem");
mem_chunks = malloc(PHYS_CHUNKS_DESC_SIZE);
if (!mem_chunks) {
perror("allocating mem chunks descriptor: ");
return 1;
}
dumpscn = bfd_get_section_by_name(dumpbfd, "physchunks");
if (!dumpscn) {
bfd_perror("bfd_get_section_by_name");
return 1;
}
kernel_base = dumpscn->vma + 0x200000;
ok = bfd_get_section_contents(dumpbfd, dumpscn, mem_chunks,
0, PHYS_CHUNKS_DESC_SIZE);
if (!ok) {
bfd_perror("read_physmem:bfd_get_section_contents");
return 1;
}
kernel_base = mem_chunks->kernel_base;
dumpscn = bfd_get_section_by_name(dumpbfd, "physmem");
if (!dumpscn) {
bfd_perror("bfd_get_section_by_name");
return 1;
}
return 0;
} /* setup_dump() */
@@ -669,18 +739,21 @@ static void command(char *cmd, char *res) {
break;
}
//if (regs[17] > MAP_KERNEL) {}
pu8 = (void *)&regs;
for (i = 0; i < sizeof(regs)-4; ++i) {
rbp += sprintf(rbp, "%02x", pu8[i]);
}
}
}
/*
else if (!strcmp(p, "mffffffff80018a82,1")) {
rbp += sprintf(rbp, "b8");
}
else if (!strcmp(p, "mffffffff80018a82,9")) {
rbp += sprintf(rbp, "b8f2ffffff41564155");
}
*/
else if (!strncmp(p, "m", 1)) {
int n;
uintptr_t start;
@@ -776,33 +849,35 @@ static void command(char *cmd, char *res) {
break;
}
q = buf;
q += sprintf(q, "PID %d, ", ti->pid);
if (ti->status & PS_RUNNING) {
q += sprintf(q, "running on cpu%d", ti->cpu);
q += sprintf(q, "%srunning on cpu %d",
ti->idle ? "idle " : "", ti->lcpu);
}
else if (ti->status & (PS_INTERRUPTIBLE | PS_UNINTERRUPTIBLE)) {
q += sprintf(q, "waiting on cpu%d", ti->lcpu);
q += sprintf(q, "%swaiting on cpu %d",
ti->idle ? "idle " : "", ti->lcpu);
}
else if (ti->status & PS_STOPPED) {
q += sprintf(q, "stopped on cpu%d", ti->lcpu);
q += sprintf(q, "%sstopped on cpu %d",
ti->idle ? "idle " : "", ti->lcpu);
}
else if (ti->status & PS_TRACED) {
q += sprintf(q, "traced on cpu%d", ti->lcpu);
q += sprintf(q, "%straced on cpu %d",
ti->idle ? "idle " : "", ti->lcpu);
}
else if (ti->status == CS_IDLE) {
q += sprintf(q, "cpu%d idle", ti->cpu);
q += sprintf(q, "cpu %d idle", ti->cpu);
}
else if (ti->status == CS_RUNNING) {
q += sprintf(q, "cpu%d running", ti->cpu);
q += sprintf(q, "cpu %d running", ti->cpu);
}
else if (ti->status == CS_RESERVED) {
q += sprintf(q, "cpu%d reserved", ti->cpu);
q += sprintf(q, "cpu %d reserved", ti->cpu);
}
else {
q += sprintf(q, "status=%#x", ti->status);
}
if (ti->tid != ti->pid) {
q += sprintf(q, ",pid=%d", ti->pid);
}
rbp += print_hex(rbp, buf);
}
} while (0);
@@ -815,11 +890,12 @@ static void options(int argc, char *argv[]) {
memset(&opt, 0, sizeof(opt));
opt.kernel_path = "./mckernel.img";
opt.dump_path = "./mcdump";
opt.mcos_fd = -1;
for (;;) {
int c;
c = getopt(argc, argv, "cd:hk:");
c = getopt(argc, argv, "ilcd:hk:o:");
if (c < 0) {
break;
}
@@ -837,12 +913,32 @@ static void options(int argc, char *argv[]) {
case 'd':
opt.dump_path = optarg;
break;
case 'i':
opt.interactive = 1;
break;
case 'o':
opt.os_id = atoi(optarg);
break;
case 'l':
opt.print_idle = 1;
break;
}
}
if (optind < argc) {
opt.help = 1;
}
if (opt.interactive) {
char fn[128];
sprintf(fn, "/dev/mcos%d", opt.os_id);
opt.mcos_fd = open(fn, O_RDONLY);
if (opt.mcos_fd < 0) {
perror("open");
exit(1);
}
}
return;
} /* options() */
@@ -925,7 +1021,7 @@ int main(int argc, char *argv[]) {
uint8_t sum;
uint8_t check;
static char lbuf[1024];
static char rbuf[1024];
static char rbuf[8192];
static char cbuf[3];
char *lbp;
char *p;

View File

@@ -0,0 +1,27 @@
#define _GNU_SOURCE
#include <dlfcn.h>
#include <sys/time.h>
#include <sched.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#undef sched_yield
/* Pointer type matching the signature of sched_yield(). */
typedef int (*int_void_fn)(void);

/* Used only by the disabled debug path inside sched_yield() below. */
static int_void_fn orig_sched_yield = NULL;

/*
 * Replacement sched_yield() that does not yield: it performs no system
 * call and reports success right away.
 * NOTE(review): presumably injected ahead of libc (e.g. via LD_PRELOAD) -
 * confirm against how libsched_yield.so is loaded.
 *
 * Returns 0 unconditionally.
 */
int sched_yield(void)
{
#if 0
	if (!orig_sched_yield) {
		orig_sched_yield = (int_void_fn)dlsym(RTLD_NEXT, "sched_yield");
	}

	printf("sched_yield() called\n");
#endif
	return 0;
}

File diff suppressed because it is too large Load Diff

View File

@@ -3,15 +3,16 @@ SRC=$(VPATH)
IHKDIR=$(IHKBASE)/$(TARGETDIR)
OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o
OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o shmobj.o
OBJS += zeroobj.o procfs.o devobj.o sysfs.o
OBJS += zeroobj.o procfs.o devobj.o sysfs.o xpmem.o profile.o freeze.o
OBJS += rbtree.o
DEPSRCS=$(wildcard $(SRC)/*.c)
CFLAGS += -I$(SRC)/include -D__KERNEL__ -g
CFLAGS += -I$(SRC)/include -I@abs_builddir@/../ -I@abs_builddir@/include -D__KERNEL__ -g -fno-omit-frame-pointer -fno-inline -fno-inline-small-functions
LDFLAGS += -e arch_start
IHKOBJ = ihk/ihk.o
include $(SRC)/config/config.$(TARGET)
include $(IHKBASE)/Makefile.common
include @abs_builddir@/../../ihk/cokernel/Makefile.common
# CFLAGS += -I$(SRC)/../arch/$(IHKARCH)/kernel/include -I$(SRC)/../lib/include

View File

@@ -9,7 +9,7 @@ V ?= $(VERBOSE)
KERNEL = kernel.img
KERNELS = $(addsuffix /$(KERNEL),$(addprefix $(O)/,$(BUILD_TARGET)))
SUBCMD_OPTS = V='$(V)'
SUBCMD_OPTS = V='$(V)' BUILD_IHK_COKERNEL=@abs_builddir@/../../ihk/cokernel
$(if $(O),,$(error Specify the compilation target directory))
#$(if $(shell ls $(IHKBASE)/Makefile),,\

View File

@@ -25,10 +25,25 @@
#include <init.h>
#include <march.h>
#include <cls.h>
#include <time.h>
#include <syscall.h>
#include <rusage.h>
//#define DEBUG_PRINT_AP
#ifdef DEBUG_PRINT_AP
#define dkprintf(...) do { kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) do { kprintf(__VA_ARGS__); } while (0)
#else
#define dkprintf(...) do { } while (0)
#define ekprintf(...) do { kprintf(__VA_ARGS__); } while (0)
#endif
int num_processors = 1;
static volatile int ap_stop = 1;
mcs_lock_node_t ap_syscall_semaphore;
static void ap_wait(void)
{
init_tick();
@@ -43,7 +58,15 @@ static void ap_wait(void)
arch_start_pvclock();
if (find_command_line("hidos")) {
init_host_syscall_channel();
mcs_lock_node_t mcs_node;
int ikc_cpu = ihk_mc_get_ikc_cpu(ihk_mc_get_processor_id());
if(ikc_cpu < 0) {
ekprintf("%s,ihk_mc_get_ikc_cpu failed\n", __FUNCTION__);
}
mcs_lock_lock_noirq(&ap_syscall_semaphore, &mcs_node);
init_host_ikc2mckernel();
init_host_ikc2linux(ikc_cpu);
mcs_lock_unlock_noirq(&ap_syscall_semaphore, &mcs_node);
}
pc_ap_init();
@@ -57,6 +80,7 @@ static void ap_wait(void)
void ap_start(void)
{
init_tick();
mcs_lock_init(&ap_syscall_semaphore);
ap_stop = 0;
sync_tick();
}
@@ -65,7 +89,7 @@ void ap_init(void)
{
struct ihk_mc_cpu_info *cpu_info;
int i;
int bsp_hw_id;
int bsp_hw_id, bsp_cpu_id;
ihk_mc_init_ap();
init_delay();
@@ -78,18 +102,28 @@ void ap_init(void)
return;
}
kprintf("BSP HW ID = %d\n", bsp_hw_id);
bsp_cpu_id = 0;
for (i = 0; i < cpu_info->ncpus; ++i) {
if (cpu_info->hw_ids[i] == bsp_hw_id) {
bsp_cpu_id = i;
break;
}
}
kprintf("BSP: %d (HW ID: %d @ NUMA %d)\n", bsp_cpu_id,
bsp_hw_id, cpu_info->nodes[0]);
for (i = 0; i < cpu_info->ncpus; i++) {
if (cpu_info->hw_ids[i] == bsp_hw_id) {
continue;
}
kprintf("AP Booting: %d (HW ID: %d)\n", i, cpu_info->hw_ids[i]);
dkprintf("AP Booting: %d (HW ID: %d @ NUMA %d)\n", i,
cpu_info->hw_ids[i], cpu_info->nodes[i]);
ihk_mc_boot_cpu(cpu_info->hw_ids[i], (unsigned long)ap_wait);
num_processors++;
}
kprintf("AP Booting: Done\n");
kprintf("BSP: booted %d AP CPUs\n", cpu_info->ncpus - 1);
}
#include <sysfs.h>
@@ -199,7 +233,7 @@ cpu_sysfs_setup(void)
/* setup table */
info = kmalloc(sizeof(*info) * num_processors, IHK_MC_AP_CRITICAL);
for (cpu = 0; cpu < num_processors; ++cpu) {
info[cpu].online = 10+cpu;
info[cpu].online = 1;
}
fake_cpu_infos = info;

View File

@@ -19,21 +19,29 @@
#include <ihk/page_alloc.h>
#include <cls.h>
#include <page.h>
#include <rusage.h>
extern int num_processors;
struct cpu_local_var *clv;
static int cpu_local_var_initialized = 0;
int cpu_local_var_initialized = 0;
void cpu_local_var_init(void)
{
int z;
int i;
z = sizeof(struct cpu_local_var) * num_processors;
z = (z + PAGE_SIZE - 1) >> PAGE_SHIFT;
clv = allocate_pages(z, IHK_MC_AP_CRITICAL);
clv = ihk_mc_alloc_pages(z, IHK_MC_AP_CRITICAL);
memset(clv, 0, z * PAGE_SIZE);
for (i = 0; i < num_processors; i++) {
clv[i].monitor = monitor->cpu + i;
INIT_LIST_HEAD(&clv[i].smp_func_req_list);
}
cpu_local_var_initialized = 1;
}

View File

@@ -37,6 +37,8 @@ static void kprintf_wait(int len, unsigned long *flags_head, int *slide) {
if (head < tail) head += buf_len;
if (tail + len > buf_len) adj = buf_len - tail;
if (head > tail && head <= tail + len + adj) {
/* When proceeding tail (producer pointer) by len would
cross head (consumer pointer) in ring-buffer */
if (mode != 1) {
*slide = 1;
break;
@@ -70,6 +72,9 @@ void kputs(char *buf)
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
/* When proceeding tail (producer pointer) by len would
cross head (consumer pointer) in ring-buffer, give up
[head, tail] because the range is overwritten */
if (slide == 1) {
kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
@@ -170,6 +175,17 @@ int kprintf(const char *format, ...)
return len;
}
/* mode:
0: mcklogd is not running.
When kmsg buffer is full, writer doesn't block
and overwrites the buffer.
1: mcklogd periodically retrieves kmsg.
When kmsg buffer is full, writer blocks until
someone retrieves kmsg.
2: mcklogd periodically retrieves kmsg.
When kmsg buffer is full, writer doesn't block
and overwrites the buffer.
*/
void kmsg_init(int mode)
{
ihk_mc_spinlock_init(&kmsg_lock);

View File

@@ -126,7 +126,8 @@ int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxp
__FUNCTION__, fd, len, off, result.handle, result.maxprot);
obj->memobj.ops = &devobj_ops;
obj->memobj.flags = MF_HAS_PAGER;
obj->memobj.flags = MF_HAS_PAGER | MF_DEV_FILE;
obj->memobj.size = len;
obj->handle = result.handle;
obj->ref = 1;
obj->pfn_pgoff = off / PAGE_SIZE;
@@ -180,19 +181,21 @@ static void devobj_release(struct memobj *memobj)
memobj_unlock(&obj->memobj);
if (free_obj) {
int error;
ihk_mc_user_context_t ctx;
if (!(free_obj->memobj.flags & MF_HOST_RELEASED)) {
int error;
ihk_mc_user_context_t ctx;
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_UNMAP;
ihk_mc_syscall_arg1(&ctx) = handle;
ihk_mc_syscall_arg2(&ctx) = 1;
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_UNMAP;
ihk_mc_syscall_arg1(&ctx) = handle;
ihk_mc_syscall_arg2(&ctx) = 1;
error = syscall_generic_forwarding(__NR_mmap, &ctx);
if (error) {
kprintf("devobj_release(%p %lx):"
"release failed. %d\n",
free_obj, handle, error);
/* through */
error = syscall_generic_forwarding(__NR_mmap, &ctx);
if (error) {
kprintf("devobj_release(%p %lx):"
"release failed. %d\n",
free_obj, handle, error);
/* through */
}
}
if (obj->pfn_table) {
@@ -228,6 +231,9 @@ static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintpt
memobj_lock(&obj->memobj);
pfn = obj->pfn_table[ix];
#ifdef PROFILE_ENABLE
profile_event_add(PROFILE_page_fault_dev_file, PAGE_SIZE);
#endif // PROFILE_ENABLE
if (!(pfn & PFN_VALID)) {
memobj_unlock(&obj->memobj);

View File

@@ -29,22 +29,26 @@
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) kprintf(__VA_ARGS__)
static ihk_spinlock_t fileobj_list_lock = SPIN_LOCK_UNLOCKED;
mcs_rwlock_lock_t fileobj_list_lock;
static LIST_HEAD(fileobj_list);
#define FILEOBJ_PAGE_HASH_SHIFT 9
#define FILEOBJ_PAGE_HASH_SIZE (1 << FILEOBJ_PAGE_HASH_SHIFT)
#define FILEOBJ_PAGE_HASH_MASK (FILEOBJ_PAGE_HASH_SIZE - 1)
struct fileobj {
struct memobj memobj; /* must be first */
long sref;
long cref;
uintptr_t handle;
struct list_head page_list;
struct list_head list;
struct memobj memobj; /* must be first */
long sref;
long cref;
uintptr_t handle;
struct list_head list;
struct list_head page_hash[FILEOBJ_PAGE_HASH_SIZE];
mcs_rwlock_lock_t page_hash_locks[FILEOBJ_PAGE_HASH_SIZE];
};
static memobj_release_func_t fileobj_release;
static memobj_ref_func_t fileobj_ref;
static memobj_get_page_func_t fileobj_get_page;
static memobj_copy_page_func_t fileobj_copy_page;
static memobj_flush_page_func_t fileobj_flush_page;
static memobj_invalidate_page_func_t fileobj_invalidate_page;
static memobj_lookup_page_func_t fileobj_lookup_page;
@@ -53,7 +57,7 @@ static struct memobj_ops fileobj_ops = {
.release = &fileobj_release,
.ref = &fileobj_ref,
.get_page = &fileobj_get_page,
.copy_page = &fileobj_copy_page,
.copy_page = NULL,
.flush_page = &fileobj_flush_page,
.invalidate_page = &fileobj_invalidate_page,
.lookup_page = &fileobj_lookup_page,
@@ -72,28 +76,36 @@ static struct memobj *to_memobj(struct fileobj *fileobj)
/***********************************************************************
* page_list
*/
static void page_list_init(struct fileobj *obj)
static void fileobj_page_hash_init(struct fileobj *obj)
{
INIT_LIST_HEAD(&obj->page_list);
int i;
for (i = 0; i < FILEOBJ_PAGE_HASH_SIZE; ++i) {
mcs_rwlock_init(&obj->page_hash_locks[i]);
INIT_LIST_HEAD(&obj->page_hash[i]);
}
return;
}
static void page_list_insert(struct fileobj *obj, struct page *page)
/* NOTE: caller must hold page_hash_locks[hash] */
static void __fileobj_page_hash_insert(struct fileobj *obj,
struct page *page, int hash)
{
list_add(&page->list, &obj->page_list);
return;
list_add(&page->list, &obj->page_hash[hash]);
}
static void page_list_remove(struct fileobj *obj, struct page *page)
/* NOTE: caller must hold page_hash_locks[hash] */
static void __fileobj_page_hash_remove(struct page *page)
{
list_del(&page->list);
}
static struct page *page_list_lookup(struct fileobj *obj, off_t off)
/* NOTE: caller must hold page_hash_locks[hash] */
static struct page *__fileobj_page_hash_lookup(struct fileobj *obj,
int hash, off_t off)
{
struct page *page;
list_for_each_entry(page, &obj->page_list, list) {
list_for_each_entry(page, &obj->page_hash[hash], list) {
if ((page->mode != PM_WILL_PAGEIO)
&& (page->mode != PM_PAGEIO)
&& (page->mode != PM_DONE_PAGEIO)
@@ -104,6 +116,7 @@ static struct page *page_list_lookup(struct fileobj *obj, off_t off)
obj, off, page->mode);
panic("page_list_lookup:invalid obj page");
}
if (page->offset == off) {
goto out;
}
@@ -114,13 +127,22 @@ out:
return page;
}
static struct page *page_list_first(struct fileobj *obj)
static struct page *fileobj_page_hash_first(struct fileobj *obj)
{
if (list_empty(&obj->page_list)) {
return NULL;
int i;
for (i = 0; i < FILEOBJ_PAGE_HASH_SIZE; ++i) {
if (!list_empty(&obj->page_hash[i])) {
break;
}
}
return list_first_entry(&obj->page_list, struct page, list);
if (i != FILEOBJ_PAGE_HASH_SIZE) {
return list_first_entry(&obj->page_hash[i], struct page, list);
}
else {
return NULL;
}
}
/***********************************************************************
@@ -163,10 +185,11 @@ static struct fileobj *obj_list_lookup(uintptr_t handle)
int fileobj_create(int fd, struct memobj **objp, int *maxprotp)
{
ihk_mc_user_context_t ctx;
struct pager_create_result result; // XXX: assumes contiguous physical
struct pager_create_result result __attribute__((aligned(64)));
int error;
struct fileobj *newobj = NULL;
struct fileobj *obj;
struct mcs_rwlock_node node;
dkprintf("fileobj_create(%d)\n", fd);
newobj = kmalloc(sizeof(*newobj), IHK_MC_AP_NOWAIT);
@@ -179,6 +202,7 @@ int fileobj_create(int fd, struct memobj **objp, int *maxprotp)
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_CREATE;
ihk_mc_syscall_arg1(&ctx) = fd;
ihk_mc_syscall_arg2(&ctx) = virt_to_phys(&result);
memset(&result, 0, sizeof(result));
error = syscall_generic_forwarding(__NR_mmap, &ctx);
if (error) {
@@ -188,27 +212,91 @@ int fileobj_create(int fd, struct memobj **objp, int *maxprotp)
memset(newobj, 0, sizeof(*newobj));
newobj->memobj.ops = &fileobj_ops;
newobj->memobj.flags = MF_HAS_PAGER;
newobj->memobj.flags = MF_HAS_PAGER | MF_REG_FILE;
newobj->handle = result.handle;
newobj->sref = 1;
newobj->cref = 1;
page_list_init(newobj);
fileobj_page_hash_init(newobj);
ihk_mc_spinlock_init(&newobj->memobj.lock);
ihk_mc_spinlock_lock_noirq(&fileobj_list_lock);
mcs_rwlock_writer_lock_noirq(&fileobj_list_lock, &node);
obj = obj_list_lookup(result.handle);
if (!obj) {
obj_list_insert(newobj);
obj = newobj;
to_memobj(obj)->size = result.size;
to_memobj(obj)->flags |= result.flags;
to_memobj(obj)->status = MEMOBJ_READY;
if (to_memobj(obj)->flags & MF_PREFETCH) {
to_memobj(obj)->status = MEMOBJ_TO_BE_PREFETCHED;
}
/* XXX: KNL specific optimization for OFP runs */
if ((to_memobj(obj)->flags & MF_PREMAP) &&
(to_memobj(obj)->flags & MF_ZEROFILL)) {
struct memobj *mo = to_memobj(obj);
int nr_pages = (result.size + (PAGE_SIZE - 1))
>> PAGE_SHIFT;
int j = 0;
int node = ihk_mc_get_nr_numa_nodes() / 2;
dkprintf("%s: MF_PREMAP, start node: %d\n",
__FUNCTION__, node);
mo->pages = kmalloc(nr_pages * sizeof(void *), IHK_MC_AP_NOWAIT);
if (!mo->pages) {
kprintf("%s: WARNING: failed to allocate pages\n",
__FUNCTION__);
goto error_cleanup;
}
mo->nr_pages = nr_pages;
memset(mo->pages, 0, nr_pages * sizeof(*mo->pages));
if (cpu_local_var(current)->proc->mpol_flags & MPOL_SHM_PREMAP) {
/* Get the actual pages NUMA interleaved */
for (j = 0; j < nr_pages; ++j) {
mo->pages[j] = ihk_mc_alloc_aligned_pages_node_user(1,
PAGE_P2ALIGN, IHK_MC_AP_NOWAIT, node);
if (!mo->pages[j]) {
kprintf("%s: ERROR: allocating pages[%d]\n",
__FUNCTION__, j);
goto error_cleanup;
}
memset(mo->pages[j], 0, PAGE_SIZE);
++node;
if (node == ihk_mc_get_nr_numa_nodes()) {
node = ihk_mc_get_nr_numa_nodes() / 2;
}
}
dkprintf("%s: allocated %d pages interleaved\n",
__FUNCTION__, nr_pages);
}
error_cleanup:
/* TODO: cleanup allocated portion */
;
}
newobj = NULL;
dkprintf("%s: new obj 0x%lx cref: %d, %s\n",
__FUNCTION__,
obj,
obj->cref,
to_memobj(obj)->flags & MF_ZEROFILL ? "zerofill" : "");
}
else {
++obj->sref;
++obj->cref;
memobj_unlock(&obj->memobj); /* locked by obj_list_lookup() */
dkprintf("%s: existing obj 0x%lx cref: %d, %s\n",
__FUNCTION__,
obj,
obj->cref,
to_memobj(obj)->flags & MF_ZEROFILL ? "zerofill" : "");
}
ihk_mc_spinlock_unlock_noirq(&fileobj_list_lock);
mcs_rwlock_writer_unlock_noirq(&fileobj_list_lock, &node);
error = 0;
*objp = to_memobj(obj);
@@ -239,6 +327,7 @@ static void fileobj_release(struct memobj *memobj)
long free_sref = 0;
uintptr_t free_handle;
struct fileobj *free_obj = NULL;
struct mcs_rwlock_node node;
dkprintf("fileobj_release(%p %lx)\n", obj, obj->handle);
@@ -252,19 +341,41 @@ static void fileobj_release(struct memobj *memobj)
obj->sref -= free_sref;
free_handle = obj->handle;
memobj_unlock(&obj->memobj);
if (obj->memobj.flags & MF_HOST_RELEASED) {
free_sref = 0; // don't call syscall_generic_forwarding
}
if (free_obj) {
ihk_mc_spinlock_lock_noirq(&fileobj_list_lock);
dkprintf("%s: release obj 0x%lx cref: %d, free_obj: 0x%lx, %s\n",
__FUNCTION__,
obj,
obj->cref,
free_obj,
to_memobj(obj)->flags & MF_ZEROFILL ? "zerofill" : "");
mcs_rwlock_writer_lock_noirq(&fileobj_list_lock, &node);
/* zap page_list */
for (;;) {
struct page *page;
int count;
void *page_va;
page = page_list_first(obj);
page = fileobj_page_hash_first(obj);
if (!page) {
break;
}
page_list_remove(obj, page);
__fileobj_page_hash_remove(page);
page_va = phys_to_virt(page_to_phys(page));
if (ihk_atomic_read(&page->count) != 1) {
kprintf("%s: WARNING: page count %d for phys 0x%lx is invalid, flags: 0x%lx\n",
__FUNCTION__,
ihk_atomic_read(&page->count),
page->phys,
to_memobj(free_obj)->flags);
}
else if (page_unmap(page)) {
ihk_mc_free_pages_user(page_va, 1);
}
#if 0
count = ihk_atomic_sub_return(1, &page->count);
if (!((page->mode == PM_WILL_PAGEIO)
@@ -281,10 +392,23 @@ static void fileobj_release(struct memobj *memobj)
}
page->mode = PM_NONE;
free_pages(phys_to_virt(page_to_phys(page)), 1);
#endif
}
/* Pre-mapped? */
if (to_memobj(free_obj)->flags & MF_PREMAP) {
int i;
for (i = 0; i < to_memobj(free_obj)->nr_pages; ++i) {
if (to_memobj(free_obj)->pages[i])
ihk_mc_free_pages_user(to_memobj(free_obj)->pages[i], 1);
}
kfree(to_memobj(free_obj)->pages);
}
obj_list_remove(free_obj);
ihk_mc_spinlock_unlock_noirq(&fileobj_list_lock);
mcs_rwlock_writer_unlock_noirq(&fileobj_list_lock, &node);
kfree(free_obj);
}
@@ -330,83 +454,144 @@ static void fileobj_do_pageio(void *args0)
struct page *page;
ihk_mc_user_context_t ctx;
ssize_t ss;
struct mcs_rwlock_node mcs_node;
int hash = (off >> PAGE_SHIFT) & FILEOBJ_PAGE_HASH_MASK;
memobj_lock(&obj->memobj);
page = page_list_lookup(obj, off);
mcs_rwlock_writer_lock_noirq(&obj->page_hash_locks[hash],
&mcs_node);
page = __fileobj_page_hash_lookup(obj, hash, off);
if (!page) {
goto out;
}
while (page->mode == PM_PAGEIO) {
memobj_unlock(&obj->memobj);
mcs_rwlock_writer_unlock_noirq(&obj->page_hash_locks[hash],
&mcs_node);
cpu_pause();
memobj_lock(&obj->memobj);
mcs_rwlock_writer_lock_noirq(&obj->page_hash_locks[hash],
&mcs_node);
}
if (page->mode == PM_WILL_PAGEIO) {
page->mode = PM_PAGEIO;
memobj_unlock(&obj->memobj);
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_READ;
ihk_mc_syscall_arg1(&ctx) = obj->handle;
ihk_mc_syscall_arg2(&ctx) = off;
ihk_mc_syscall_arg3(&ctx) = pgsize;
ihk_mc_syscall_arg4(&ctx) = page_to_phys(page);
ss = syscall_generic_forwarding(__NR_mmap, &ctx);
memobj_lock(&obj->memobj);
if (page->mode != PM_PAGEIO) {
kprintf("fileobj_do_pageio(%p,%lx,%lx):"
"invalid mode %x\n",
obj, off, pgsize, page->mode);
panic("fileobj_do_pageio:invalid page mode");
if (to_memobj(obj)->flags & MF_ZEROFILL) {
void *virt = phys_to_virt(page_to_phys(page));
memset(virt, 0, PAGE_SIZE);
#ifdef PROFILE_ENABLE
profile_event_add(PROFILE_page_fault_file_clr, PAGE_SIZE);
#endif // PROFILE_ENABLE
}
else {
page->mode = PM_PAGEIO;
mcs_rwlock_writer_unlock_noirq(&obj->page_hash_locks[hash],
&mcs_node);
if (ss == 0) {
dkprintf("fileobj_do_pageio(%p,%lx,%lx):EOF? %ld\n",
obj, off, pgsize, ss);
page->mode = PM_PAGEIO_EOF;
goto out;
}
else if (ss != pgsize) {
kprintf("fileobj_do_pageio(%p,%lx,%lx):"
"read failed. %ld\n",
obj, off, pgsize, ss);
page->mode = PM_PAGEIO_ERROR;
goto out;
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_READ;
ihk_mc_syscall_arg1(&ctx) = obj->handle;
ihk_mc_syscall_arg2(&ctx) = off;
ihk_mc_syscall_arg3(&ctx) = pgsize;
ihk_mc_syscall_arg4(&ctx) = page_to_phys(page);
dkprintf("%s: __NR_mmap for handle 0x%lx\n",
__FUNCTION__, obj->handle);
ss = syscall_generic_forwarding(__NR_mmap, &ctx);
mcs_rwlock_writer_lock_noirq(&obj->page_hash_locks[hash],
&mcs_node);
if (page->mode != PM_PAGEIO) {
kprintf("fileobj_do_pageio(%p,%lx,%lx):"
"invalid mode %x\n",
obj, off, pgsize, page->mode);
panic("fileobj_do_pageio:invalid page mode");
}
if (ss == 0) {
dkprintf("fileobj_do_pageio(%p,%lx,%lx):EOF? %ld\n",
obj, off, pgsize, ss);
page->mode = PM_PAGEIO_EOF;
goto out;
}
else if (ss != pgsize) {
kprintf("fileobj_do_pageio(%p,%lx,%lx):"
"read failed. %ld\n",
obj, off, pgsize, ss);
page->mode = PM_PAGEIO_ERROR;
goto out;
}
}
page->mode = PM_DONE_PAGEIO;
}
out:
memobj_unlock(&obj->memobj);
mcs_rwlock_writer_unlock_noirq(&obj->page_hash_locks[hash],
&mcs_node);
fileobj_release(&obj->memobj); /* got fileobj_get_page() */
kfree(args0);
dkprintf("fileobj_do_pageio(%p,%lx,%lx):\n", obj, off, pgsize);
return;
}
static int fileobj_get_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp, unsigned long *pflag)
static int fileobj_get_page(struct memobj *memobj, off_t off,
int p2align, uintptr_t *physp, unsigned long *pflag)
{
struct thread *proc = cpu_local_var(current);
struct fileobj *obj = to_fileobj(memobj);
int error;
int error = -1;
void *virt = NULL;
int npages;
uintptr_t phys = -1;
struct page *page;
struct pageio_args *args = NULL;
struct mcs_rwlock_node mcs_node;
int hash = (off >> PAGE_SHIFT) & FILEOBJ_PAGE_HASH_MASK;
dkprintf("fileobj_get_page(%p,%lx,%x,%p)\n", obj, off, p2align, physp);
memobj_lock(&obj->memobj);
if (p2align != PAGE_P2ALIGN) {
error = -ENOMEM;
goto out;
return -ENOMEM;
}
page = page_list_lookup(obj, off);
#ifdef PROFILE_ENABLE
profile_event_add(PROFILE_page_fault_file, PAGE_SIZE);
#endif // PROFILE_ENABLE
if (memobj->flags & MF_PREMAP) {
int page_ind = off >> PAGE_SHIFT;
if (!memobj->pages[page_ind]) {
virt = ihk_mc_alloc_pages_user(1, IHK_MC_AP_NOWAIT | IHK_MC_AP_USER);
if (!virt) {
error = -ENOMEM;
kprintf("fileobj_get_page(%p,%lx,%x,%p):"
"alloc failed. %d\n",
obj, off, p2align, physp,
error);
goto out_nolock;
}
/* Update the array but see if someone did it already and use
* that if so */
if (!__sync_bool_compare_and_swap(&memobj->pages[page_ind],
NULL, virt)) {
ihk_mc_free_pages_user(virt, 1);
}
else {
dkprintf("%s: MF_ZEROFILL: off: %lu -> 0x%lx allocated\n",
__FUNCTION__, off, virt_to_phys(virt));
}
}
virt = memobj->pages[page_ind];
error = 0;
*physp = virt_to_phys(virt);
dkprintf("%s: MF_ZEROFILL: off: %lu -> 0x%lx resolved\n",
__FUNCTION__, off, virt_to_phys(virt));
virt = NULL;
goto out_nolock;
}
mcs_rwlock_writer_lock_noirq(&obj->page_hash_locks[hash],
&mcs_node);
page = __fileobj_page_hash_lookup(obj, hash, off);
if (!page || (page->mode == PM_WILL_PAGEIO)
|| (page->mode == PM_PAGEIO)) {
args = kmalloc(sizeof(*args), IHK_MC_AP_NOWAIT);
@@ -420,7 +605,10 @@ static int fileobj_get_page(struct memobj *memobj, off_t off, int p2align, uintp
if (!page) {
npages = 1 << p2align;
virt = ihk_mc_alloc_pages(npages, IHK_MC_AP_NOWAIT);
virt = ihk_mc_alloc_pages_user(npages, IHK_MC_AP_NOWAIT |
(to_memobj(obj)->flags & MF_ZEROFILL) ? IHK_MC_AP_USER : 0);
if (!virt) {
error = -ENOMEM;
kprintf("fileobj_get_page(%p,%lx,%x,%p):"
@@ -430,17 +618,19 @@ static int fileobj_get_page(struct memobj *memobj, off_t off, int p2align, uintp
goto out;
}
phys = virt_to_phys(virt);
page = phys_to_page(phys);
page = phys_to_page_insert_hash(phys);
if (page->mode != PM_NONE) {
panic("fileobj_get_page:invalid new page");
}
page->mode = PM_WILL_PAGEIO;
page->offset = off;
ihk_atomic_set(&page->count, 1);
page_list_insert(obj, page);
__fileobj_page_hash_insert(obj, page, hash);
page->mode = PM_WILL_PAGEIO;
}
memobj_lock(&obj->memobj);
++obj->cref; /* for fileobj_do_pageio() */
memobj_unlock(&obj->memobj);
args->fileobj = obj;
args->objoff = off;
@@ -472,9 +662,11 @@ static int fileobj_get_page(struct memobj *memobj, off_t off, int p2align, uintp
*physp = page_to_phys(page);
virt = NULL;
out:
memobj_unlock(&obj->memobj);
mcs_rwlock_writer_unlock_noirq(&obj->page_hash_locks[hash],
&mcs_node);
out_nolock:
if (virt) {
ihk_mc_free_pages(virt, npages);
ihk_mc_free_pages_user(virt, npages);
}
if (args) {
kfree(args);
@@ -484,76 +676,6 @@ out:
return error;
}
/*
 * Produce a private copy of the file-backed page at physical address orgpa.
 *
 * memobj:  memory object whose lock protects the page state examined here.
 * orgpa:   physical address of the original page.
 * p2align: must equal PAGE_P2ALIGN; any other value panics.
 *
 * Returns the physical address to use for the private mapping: either
 * orgpa itself (when the reference count allows taking the page over) or
 * the address of a newly allocated copy. Returns (uintptr_t)-1 if the new
 * page could not be allocated.
 *
 * NOTE(review): the ops table in this same diff sets .copy_page = NULL, so
 * this function may no longer be referenced -- confirm before relying on it.
 */
static uintptr_t fileobj_copy_page(
struct memobj *memobj, uintptr_t orgpa, int p2align)
{
struct page *orgpage = phys_to_page(orgpa);
size_t pgsize = PAGE_SIZE << p2align;
int npages = 1 << p2align;
void *newkva = NULL;
uintptr_t newpa = -1;
void *orgkva;
int count;
dkprintf("fileobj_copy_page(%p,%lx,%d)\n", memobj, orgpa, p2align);
if (p2align != PAGE_P2ALIGN) {
panic("p2align");
}
memobj_lock(memobj);
/*
 * The lock is dropped around the allocation below, so every iteration
 * re-validates the page mode and reference count from scratch.
 */
for (;;) {
if (orgpage->mode != PM_MAPPED) {
kprintf("fileobj_copy_page(%p,%lx,%d):"
"invalid cow page. %x\n",
memobj, orgpa, p2align, orgpage->mode);
panic("fileobj_copy_page:invalid cow page");
}
count = ihk_atomic_read(&orgpage->count);
/*
 * count == 2: take over the original page instead of copying. It is
 * unlinked from its list, its count is decremented and its mode reset
 * to PM_NONE. If a page was already allocated in a previous iteration,
 * newkva is still set and is released at "out".
 */
if (count == 2) { // XXX: private only
list_del(&orgpage->list);
ihk_atomic_dec(&orgpage->count);
orgpage->mode = PM_NONE;
newpa = orgpa;
break;
}
if (count <= 0) {
kprintf("fileobj_copy_page(%p,%lx,%d):"
"orgpage count corrupted. %x\n",
memobj, orgpa, p2align, count);
panic("fileobj_copy_page:orgpage count corrupted");
}
/*
 * Second pass with a page in hand: copy the contents, drop one
 * reference on the original and hand the new page to the caller.
 */
if (newkva) {
orgkva = phys_to_virt(orgpa);
memcpy(newkva, orgkva, pgsize);
ihk_atomic_dec(&orgpage->count);
newpa = virt_to_phys(newkva);
page_map(phys_to_page(newpa));
newkva = NULL; /* avoid ihk_mc_free_pages() */
break;
}
/* First pass: allocate the destination page without holding the lock. */
memobj_unlock(memobj);
newkva = ihk_mc_alloc_aligned_pages(npages, p2align,
IHK_MC_AP_NOWAIT);
if (!newkva) {
kprintf("fileobj_copy_page(%p,%lx,%d):"
"alloc page failed\n",
memobj, orgpa, p2align);
/* The lock is not held here, so jump past the unlock below. */
goto out;
}
memobj_lock(memobj);
}
memobj_unlock(memobj);
out:
/* Only non-NULL when an allocated page ended up unused. */
if (newkva) {
ihk_mc_free_pages(newkva, npages);
}
dkprintf("fileobj_copy_page(%p,%lx,%d): %lx\n",
memobj, orgpa, p2align, newpa);
return newpa;
}
static int fileobj_flush_page(struct memobj *memobj, uintptr_t phys,
size_t pgsize)
{
@@ -562,7 +684,20 @@ static int fileobj_flush_page(struct memobj *memobj, uintptr_t phys,
ihk_mc_user_context_t ctx;
ssize_t ss;
if (to_memobj(obj)->flags & MF_ZEROFILL) {
return 0;
}
if (memobj->flags |= MF_HOST_RELEASED) {
return 0;
}
page = phys_to_page(phys);
if (!page) {
kprintf("%s: warning: tried to flush non-existing page for phys addr: 0x%lx\n",
__FUNCTION__, phys);
return 0;
}
memobj_unlock(&obj->memobj);
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_WRITE;
@@ -585,63 +720,48 @@ static int fileobj_flush_page(struct memobj *memobj, uintptr_t phys,
/*
 * Invalidate the file-object page backing physical address phys.
 *
 * As written, the page is looked up by phys and then by its offset in the
 * object's page list; if it is found and its reference count is exactly 1,
 * page_unmap() is called and, when that returns non-zero, the backing
 * pages are freed. Every path through the visible code returns 0.
 *
 * NOTE(review): the statements after "return error;" (the TODO comment,
 * the kprintf warning and "return 0;") are unreachable as written. This
 * looks like the old body and its replacement both surviving from a diff
 * rendering -- confirm which body is the intended one.
 */
static int fileobj_invalidate_page(struct memobj *memobj, uintptr_t phys,
size_t pgsize)
{
struct fileobj *obj = to_fileobj(memobj);
int error;
struct page *page;
dkprintf("fileobj_invalidate_page(%p,%#lx,%#lx)\n",
memobj, phys, pgsize);
/* Nothing to do when phys has no page or the page is not in this object. */
if (!(page = phys_to_page(phys))
|| !(page = page_list_lookup(obj, page->offset))) {
error = 0;
goto out;
}
/* Only a page with a single remaining reference is unmapped and freed. */
if (ihk_atomic_read(&page->count) == 1) {
if (page_unmap(page)) {
ihk_mc_free_pages(phys_to_virt(phys),
pgsize/PAGE_SIZE);
}
}
error = 0;
out:
dkprintf("fileobj_invalidate_page(%p,%#lx,%#lx):%d\n",
memobj, phys, pgsize, error);
return error;
/* TODO: keep track of reverse mappings so that invalidation
* can be performed */
kprintf("%s: WARNING: file mapping invalidation not supported\n",
__FUNCTION__);
return 0;
}
static int fileobj_lookup_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp, unsigned long *pflag)
static int fileobj_lookup_page(struct memobj *memobj, off_t off,
int p2align, uintptr_t *physp, unsigned long *pflag)
{
struct fileobj *obj = to_fileobj(memobj);
int error;
uintptr_t phys = -1;
int error = -1;
struct page *page;
struct mcs_rwlock_node mcs_node;
int hash = (off >> PAGE_SHIFT) & FILEOBJ_PAGE_HASH_MASK;
dkprintf("fileobj_lookup_page(%p,%lx,%x,%p)\n", obj, off, p2align, physp);
memobj_lock(&obj->memobj);
if (p2align != PAGE_P2ALIGN) {
error = -ENOMEM;
goto out;
return -ENOMEM;
}
page = page_list_lookup(obj, off);
mcs_rwlock_reader_lock_noirq(&obj->page_hash_locks[hash],
&mcs_node);
page = __fileobj_page_hash_lookup(obj, hash, off);
if (!page) {
error = -ENOENT;
dkprintf("fileobj_lookup_page(%p,%lx,%x,%p): page not found. %d\n", obj, off, p2align, physp, error);
goto out;
}
phys = page_to_phys(page);
*physp = page_to_phys(page);
error = 0;
if (physp) {
*physp = phys;
}
out:
memobj_unlock(&obj->memobj);
dkprintf("fileobj_lookup_page(%p,%lx,%x,%p): %d %lx\n",
obj, off, p2align, physp, error, phys);
mcs_rwlock_reader_unlock_noirq(&obj->page_hash_locks[hash],
&mcs_node);
dkprintf("fileobj_lookup_page(%p,%lx,%x,%p): %d \n",
obj, off, p2align, physp, error);
return error;
}

55
kernel/freeze.c Normal file
View File

@@ -0,0 +1,55 @@
#include <kmsg.h>
#include <string.h>
#include <ihk/cpu.h>
#include <ihk/debug.h>
#include <cls.h>
#include <rusage.h>
extern int nmi_mode;
extern void mod_nmi_ctx(void *, void(*)());
extern void lapic_ack();
extern void __freeze();
void
freeze()
{
struct ihk_os_cpu_monitor *monitor = cpu_local_var(monitor);
monitor->status_bak = monitor->status;
monitor->status = IHK_OS_MONITOR_KERNEL_FROZEN;
while (monitor->status == IHK_OS_MONITOR_KERNEL_FROZEN)
cpu_halt();
monitor->status = monitor->status_bak;
}
long
freeze_thaw(void *nmi_ctx)
{
struct ihk_os_cpu_monitor *monitor = cpu_local_var(monitor);
if (nmi_mode == 1) {
if (monitor->status != IHK_OS_MONITOR_KERNEL_FROZEN) {
#if 1
mod_nmi_ctx(nmi_ctx, __freeze);
return 1;
#else
unsigned long flags;
flags = cpu_disable_interrupt_save();
monitor->status_bak = monitor->status;
monitor->status = IHK_OS_MONITOR_KERNEL_FROZEN;
lapic_ack();
while (monitor->status == IHK_OS_MONITOR_KERNEL_FROZEN)
cpu_halt();
monitor->status = monitor->status_bak;
cpu_restore_interrupt(flags);
#endif
}
}
else if(nmi_mode == 2) {
if (monitor->status == IHK_OS_MONITOR_KERNEL_FROZEN) {
monitor->status = IHK_OS_MONITOR_KERNEL_THAW;
}
}
return 0;
}

View File

@@ -248,9 +248,13 @@ static int cmpxchg_futex_value_locked(uint32_t __user *uaddr, uint32_t uval, uin
/*
 * Load the 32-bit futex word at `from` into *dest.
 *
 * Always returns 0; no failure path exists in the visible code.
 *
 * NOTE(review): *dest is stored twice below, first via getint_user() and
 * then via a direct volatile load, so the first store is overwritten.
 * This looks like an old line and its replacement both surviving from a
 * diff rendering -- confirm which single load is intended.
 */
static int get_futex_value_locked(uint32_t *dest, uint32_t *from)
{
/* RIKEN: futexes are always on not swappable pages */
*dest = getint_user((int *)from);
/*
* Officially we should call:
* return getint_user((int *)dest, (int *)from);
*
* but McKernel on x86 can just access user-space.
*/
*dest = *(volatile uint32_t *)from;
return 0;
}
@@ -670,25 +674,32 @@ static uint64_t futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q
uint64_t timeout)
{
uint64_t time_remain = 0;
unsigned long irqstate;
struct thread *thread = cpu_local_var(current);
/*
* The task state is guaranteed to be set before another task can
* wake it. set_current_state() is implemented using set_mb() and
* queue_me() calls spin_unlock() upon completion, both serializing
* access to the hash list and forcing another memory barrier.
* wake it.
* queue_me() calls spin_unlock() upon completion, serializing
* access to the hash list and forcing a memory barrier.
*/
xchg4(&(cpu_local_var(current)->status), PS_INTERRUPTIBLE);
/* Indicate spin sleep */
irqstate = ihk_mc_spinlock_lock(&thread->spin_sleep_lock);
thread->spin_sleep = 1;
ihk_mc_spinlock_unlock(&thread->spin_sleep_lock, irqstate);
queue_me(q, hb);
if (!plist_node_empty(&q->list)) {
/* RIKEN: use mcos timers */
if (timeout) {
dkprintf("futex_wait_queue_me(): tid: %d schedule_timeout()\n", cpu_local_var(current)->tid);
time_remain = schedule_timeout(timeout);
}
else {
dkprintf("futex_wait_queue_me(): tid: %d schedule()\n", cpu_local_var(current)->tid);
schedule();
spin_sleep_or_schedule();
time_remain = 0;
}
@@ -697,6 +708,7 @@ static uint64_t futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q
/* This does not need to be serialized */
cpu_local_var(current)->status = PS_RUNNING;
thread->spin_sleep = 0;
return time_remain;
}
@@ -743,14 +755,17 @@ static int futex_wait_setup(uint32_t __user *uaddr, uint32_t val, int fshared,
*/
q->key = FUTEX_KEY_INIT;
ret = get_futex_key(uaddr, fshared, &q->key);
if ((ret != 0))
if (ret != 0)
return ret;
*hb = queue_lock(q);
ret = get_futex_value_locked(&uval, uaddr);
/* RIKEN: get_futex_value_locked() always returns 0 on mckernel */
if (ret) {
queue_unlock(q, *hb);
put_futex_key(fshared, &q->key);
return ret;
}
if (uval != val) {
queue_unlock(q, *hb);
@@ -773,11 +788,18 @@ static int futex_wait(uint32_t __user *uaddr, int fshared,
if (!bitset)
return -EINVAL;
#ifdef PROFILE_ENABLE
if (cpu_local_var(current)->profile &&
cpu_local_var(current)->profile_start_ts) {
cpu_local_var(current)->profile_elapsed_ts +=
(rdtsc() - cpu_local_var(current)->profile_start_ts);
cpu_local_var(current)->profile_start_ts = 0;
}
#endif
q.bitset = bitset;
q.requeue_pi_key = NULL;
/* RIKEN: futex_wait_queue_me() calls schedule_timeout() if timer is set */
retry:
/* Prepare to wait on uaddr. */
ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
@@ -809,6 +831,11 @@ retry:
out_put_key:
put_futex_key(fshared, &q.key);
out:
#ifdef PROFILE_ENABLE
if (cpu_local_var(current)->profile) {
cpu_local_var(current)->profile_start_ts = rdtsc();
}
#endif
return ret;
}

View File

@@ -23,14 +23,15 @@
#include <ihk/debug.h>
#include <ihk/ikc.h>
#include <ikc/master.h>
#include <syscall.h>
#include <cls.h>
#include <syscall.h>
#include <process.h>
#include <page.h>
#include <mman.h>
#include <init.h>
#include <kmalloc.h>
#include <sysfs.h>
#include <ihk/perfctr.h>
//#define DEBUG_PRINT_HOST
@@ -40,6 +41,9 @@
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#endif
/* Linux channel table, indexed by Linux CPU id */
static struct ihk_ikc_channel_desc **ikc2linuxs = NULL;
void check_mapping_for_proc(struct thread *thread, unsigned long addr)
{
unsigned long __phys;
@@ -87,11 +91,15 @@ int prepare_process_ranges_args_envs(struct thread *thread,
struct address_space *as = vm->address_space;
long aout_base;
int error;
struct vm_range *range;
unsigned long ap_flags;
enum ihk_mc_pt_attribute ptattr;
n = p->num_sections;
aout_base = (pn->reloc)? vm->region.map_end: 0;
for (i = 0; i < n; i++) {
ap_flags = 0;
if (pn->sections[i].interp && (interp_nbase == (uintptr_t)-1)) {
interp_obase = pn->sections[i].vaddr;
interp_obase -= (interp_obase % pn->interp_align);
@@ -112,48 +120,51 @@ int prepare_process_ranges_args_envs(struct thread *thread,
s = (pn->sections[i].vaddr) & PAGE_MASK;
e = (pn->sections[i].vaddr + pn->sections[i].len
+ PAGE_SIZE - 1) & PAGE_MASK;
range_npages = (e - s) >> PAGE_SHIFT;
range_npages = ((pn->sections[i].vaddr - s) +
pn->sections[i].filesz + PAGE_SIZE - 1) >> PAGE_SHIFT;
flags = VR_NONE;
flags |= PROT_TO_VR_FLAG(pn->sections[i].prot);
flags |= VRFLAG_PROT_TO_MAXPROT(flags);
flags |= VR_DEMAND_PAGING;
if ((up_v = ihk_mc_alloc_pages(range_npages, IHK_MC_AP_NOWAIT))
== NULL) {
kprintf("ERROR: alloc pages for ELF section %i\n", i);
goto err;
}
up = virt_to_phys(up_v);
if (add_process_memory_range(vm, s, e, up, flags, NULL, 0,
PAGE_SHIFT) != 0) {
ihk_mc_free_pages(up_v, range_npages);
/* Non-TEXT sections that are large respect user allocation policy
* unless user explicitly requests otherwise */
if (i >= 1 && pn->sections[i].len >= pn->mpol_threshold &&
!(pn->mpol_flags & MPOL_NO_BSS)) {
dkprintf("%s: section: %d size: %d pages -> IHK_MC_AP_USER\n",
__FUNCTION__, i, range_npages);
ap_flags = IHK_MC_AP_USER;
flags |= VR_AP_USER;
}
if (add_process_memory_range(vm, s, e, NOPHYS, flags, NULL, 0,
pn->sections[i].len > LARGE_PAGE_SIZE ?
LARGE_PAGE_SHIFT : PAGE_SHIFT,
&range) != 0) {
kprintf("ERROR: adding memory range for ELF section %i\n", i);
goto err;
}
{
void *_virt = (void *)s;
unsigned long _phys;
if (ihk_mc_pt_virt_to_phys(as->page_table,
_virt, &_phys)) {
kprintf("ERROR: no mapping for 0x%lX\n", _virt);
}
for (_virt = (void *)s + PAGE_SIZE;
(unsigned long)_virt < e; _virt += PAGE_SIZE) {
unsigned long __phys;
if (ihk_mc_pt_virt_to_phys(as->page_table,
_virt, &__phys)) {
kprintf("ERROR: no mapping for 0x%lX\n", _virt);
panic("mapping");
}
if (__phys != _phys + PAGE_SIZE) {
kprintf("0x%lX + PAGE_SIZE is not physically contigous, from 0x%lX to 0x%lX\n", _virt - PAGE_SIZE, _phys, __phys);
panic("mondai");
}
if ((up_v = ihk_mc_alloc_pages_user(range_npages,
IHK_MC_AP_NOWAIT | ap_flags)) == NULL) {
kprintf("ERROR: alloc pages for ELF section %i\n", i);
goto err;
}
_phys = __phys;
}
dkprintf("0x%lX -> 0x%lX is physically contigous\n", s, e);
up = virt_to_phys(up_v);
ptattr = arch_vrflag_to_ptattr(range->flag, PF_POPULATE, NULL);
error = ihk_mc_pt_set_range(vm->address_space->page_table, vm,
(void *)range->start,
(void *)range->start + (range_npages * PAGE_SIZE),
up, ptattr,
range->pgshift);
if (error) {
kprintf("%s: ihk_mc_pt_set_range failed. %d\n",
__FUNCTION__, error);
ihk_mc_free_pages_user(up_v, range_npages);
goto err;
}
p->sections[i].remote_pa = up;
@@ -198,7 +209,43 @@ int prepare_process_ranges_args_envs(struct thread *thread,
pn->at_entry += aout_base;
}
vm->region.brk_start = vm->region.brk_end = vm->region.data_end;
vm->region.brk_start = vm->region.brk_end =
(vm->region.data_end + LARGE_PAGE_SIZE - 1) & LARGE_PAGE_MASK;
#if 0
{
void *heap;
dkprintf("%s: requested heap size: %lu\n",
__FUNCTION__, proc->heap_extension);
heap = ihk_mc_alloc_aligned_pages(proc->heap_extension >> PAGE_SHIFT,
LARGE_PAGE_P2ALIGN, IHK_MC_AP_NOWAIT |
(!(proc->mpol_flags & MPOL_NO_HEAP) ? IHK_MC_AP_USER : 0));
if (!heap) {
kprintf("%s: error: allocating heap\n", __FUNCTION__);
goto err;
}
flags = VR_PROT_READ | VR_PROT_WRITE;
flags |= VRFLAG_PROT_TO_MAXPROT(flags);
if (add_process_memory_range(vm, vm->region.brk_start,
vm->region.brk_start + proc->heap_extension,
virt_to_phys(heap),
flags, NULL, 0, LARGE_PAGE_P2ALIGN, NULL) != 0) {
ihk_mc_free_pages(heap, proc->heap_extension >> PAGE_SHIFT);
kprintf("%s: error: adding memory range for heap\n", __FUNCTION__);
goto err;
}
vm->region.brk_end_allocated = vm->region.brk_end +
proc->heap_extension;
dkprintf("%s: heap @ 0x%lx:%lu\n",
__FUNCTION__, vm->region.brk_start, proc->heap_extension);
}
#else
vm->region.brk_end_allocated = vm->region.brk_end;
#endif
/* Map, copy and update args and envs */
flags = VR_PROT_READ | VR_PROT_WRITE;
@@ -206,15 +253,16 @@ int prepare_process_ranges_args_envs(struct thread *thread,
addr = vm->region.map_start - PAGE_SIZE * SCD_RESERVED_COUNT;
e = addr + PAGE_SIZE * ARGENV_PAGE_COUNT;
if((args_envs = ihk_mc_alloc_pages(ARGENV_PAGE_COUNT, IHK_MC_AP_NOWAIT)) == NULL){
if((args_envs = ihk_mc_alloc_pages_user(ARGENV_PAGE_COUNT,
IHK_MC_AP_NOWAIT)) == NULL){
kprintf("ERROR: allocating pages for args/envs\n");
goto err;
}
args_envs_p = virt_to_phys(args_envs);
if(add_process_memory_range(vm, addr, e, args_envs_p,
flags, NULL, 0, PAGE_SHIFT) != 0){
ihk_mc_free_pages(args_envs, ARGENV_PAGE_COUNT);
flags, NULL, 0, PAGE_SHIFT, NULL) != 0){
ihk_mc_free_pages_user(args_envs, ARGENV_PAGE_COUNT);
kprintf("ERROR: adding memory range for args/envs\n");
goto err;
}
@@ -393,7 +441,9 @@ static int process_msg_prepare_process(unsigned long rphys)
memcpy_long(pn, p, sizeof(struct program_load_desc)
+ sizeof(struct program_image_section) * n);
if((thread = create_thread(p->entry)) == NULL){
if ((thread = create_thread(p->entry,
(unsigned long *)&p->cpu_set,
sizeof(p->cpu_set))) == NULL) {
kfree(pn);
ihk_mc_unmap_virtual(p, npages, 1);
ihk_mc_unmap_memory(NULL, phys, sz);
@@ -414,6 +464,14 @@ static int process_msg_prepare_process(unsigned long rphys)
proc->sgid = pn->cred[6];
proc->fsgid = pn->cred[7];
proc->termsig = SIGCHLD;
proc->mpol_flags = pn->mpol_flags;
proc->mpol_threshold = pn->mpol_threshold;
proc->nr_processes = pn->nr_processes;
proc->heap_extension = pn->heap_extension;
#ifdef PROFILE_ENABLE
proc->profile = pn->profile;
thread->profile = pn->profile;
#endif
vm->region.user_start = pn->user_start;
vm->region.user_end = pn->user_end;
@@ -432,9 +490,6 @@ static int process_msg_prepare_process(unsigned long rphys)
vm->region.map_end = vm->region.map_start;
memcpy(proc->rlimit, pn->rlimit, sizeof(struct rlimit) * MCK_RLIM_MAX);
/* TODO: Clear it at the proper timing */
cpu_local_var(scp).post_idx = 0;
if (prepare_process_ranges_args_envs(thread, pn, p, attr,
NULL, 0, NULL, 0) != 0) {
kprintf("error: preparing process ranges, args, envs, stack\n");
@@ -459,70 +514,6 @@ err:
return -ENOMEM;
}
/*
 * Prepare the per-channel syscall parameters before the init message is sent.
 *
 * Allocates RESPONSE_PAGE_COUNT pages for the response area, records both
 * its virtual and physical address in lparam, and publishes the physical
 * address through pcp. The request and doorbell page fields of pcp are
 * zeroed here.
 *
 * NOTE(review): the allocation result is not checked before it is passed
 * to virt_to_phys() - confirm that allocation cannot fail at this point.
 */
static void process_msg_init(struct ikc_scd_init_param *pcp,
		struct syscall_params *lparam)
{
	/* Request and doorbell pages start out as zero. */
	pcp->doorbell_page = 0;
	pcp->request_page = 0;

	/* Response area: allocate, translate, then advertise it. */
	lparam->response_va = ihk_mc_alloc_pages(RESPONSE_PAGE_COUNT, 0);
	lparam->response_pa = virt_to_phys(lparam->response_va);
	pcp->response_page = lparam->response_pa;
}
/*
 * Handle the acknowledgement of a channel init message.
 *
 * c:     channel the acknowledgement arrived on; it selects which per-CPU
 *        syscall_params instance (scp or scp2) is filled in.
 * pphys: physical address of the struct ikc_scd_init_param to read.
 *
 * For each of the request, doorbell and post areas the address found in the
 * init parameter block is recorded, mapped with ihk_mc_map_memory() and then
 * mapped into virtual space with ihk_mc_map_virtual(). A failed virtual
 * mapping ends in panic("ENOMEM"). Finally post_fin is set to 1 and the
 * resulting addresses are printed via dkprintf().
 */
static void process_msg_init_acked(struct ihk_ikc_channel_desc *c, unsigned long pphys)
{
struct ikc_scd_init_param *param = phys_to_virt(pphys);
struct syscall_params *lparam;
enum ihk_mc_pt_attribute attr;
/* The same page attributes are used for all three virtual mappings below. */
attr = PTATTR_NO_EXECUTE | PTATTR_WRITABLE | PTATTR_FOR_USER;
/* Default to scp; switch to scp2 when the ack came in on syscall_channel2. */
lparam = &cpu_local_var(scp);
if(cpu_local_var(syscall_channel2) == c)
lparam = &cpu_local_var(scp2);
/* Request area: REQUEST_PAGE_COUNT pages. */
lparam->request_rpa = param->request_page;
lparam->request_pa = ihk_mc_map_memory(NULL, param->request_page,
REQUEST_PAGE_COUNT * PAGE_SIZE);
if((lparam->request_va = ihk_mc_map_virtual(lparam->request_pa,
REQUEST_PAGE_COUNT,
attr)) == NULL){
// TODO:
panic("ENOMEM");
}
/* Doorbell area: DOORBELL_PAGE_COUNT pages. */
lparam->doorbell_rpa = param->doorbell_page;
lparam->doorbell_pa = ihk_mc_map_memory(NULL, param->doorbell_page,
DOORBELL_PAGE_COUNT *
PAGE_SIZE);
if((lparam->doorbell_va = ihk_mc_map_virtual(lparam->doorbell_pa,
DOORBELL_PAGE_COUNT,
attr)) == NULL){
// TODO:
panic("ENOMEM");
}
/* Post area: a single page. */
lparam->post_rpa = param->post_page;
lparam->post_pa = ihk_mc_map_memory(NULL, param->post_page,
PAGE_SIZE);
if((lparam->post_va = ihk_mc_map_virtual(lparam->post_pa, 1,
attr)) == NULL){
// TODO:
panic("ENOMEM");
}
lparam->post_fin = 1;
/* Debug dump of every address set up above (plus the response area). */
dkprintf("Syscall parameters: (%d)\n", ihk_mc_get_processor_id());
dkprintf(" Response: %lx, %p\n",
lparam->response_pa, lparam->response_va);
dkprintf(" Request : %lx, %lx, %p\n",
lparam->request_pa, lparam->request_rpa, lparam->request_va);
dkprintf(" Doorbell: %lx, %lx, %p\n",
lparam->doorbell_pa, lparam->doorbell_rpa, lparam->doorbell_va);
dkprintf(" Post: %lx, %lx, %p\n",
lparam->post_pa, lparam->post_rpa, lparam->post_va);
}
static void syscall_channel_send(struct ihk_ikc_channel_desc *c,
struct ikc_scd_packet *packet)
{
@@ -530,40 +521,16 @@ static void syscall_channel_send(struct ihk_ikc_channel_desc *c,
}
extern unsigned long do_kill(struct thread *, int, int, int, struct siginfo *, int ptracecont);
extern void process_procfs_request(unsigned long rarg);
extern void process_procfs_request(struct ikc_scd_packet *rpacket);
extern void terminate_host(int pid);
extern void debug_log(long);
static void req_get_cpu_mapping(long req_rpa)
{
size_t mapsize;
size_t size;
int npages;
long phys;
struct get_cpu_mapping_req *req;
struct cpu_mapping *buf;
size = sizeof(*req);
mapsize = size + (req_rpa & (PAGE_SIZE - 1));
npages = (mapsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
phys = ihk_mc_map_memory(NULL, req_rpa, size);
req = ihk_mc_map_virtual(phys, npages, PTATTR_WRITABLE);
req->error = arch_get_cpu_mapping(&buf, &req->buf_elems);
if (!req->error) {
req->buf_rpa = virt_to_phys(buf);
}
ihk_mc_unmap_virtual(req, npages, 0);
ihk_mc_unmap_memory(NULL, phys, size);
return;
} /* req_get_cpu_mapping() */
static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
void *__packet, void *ihk_os)
{
struct ikc_scd_packet *packet = __packet;
struct ikc_scd_packet pckt;
struct ihk_ikc_channel_desc *resp_channel = cpu_local_var(ikc2linux);
int rc;
struct mcs_rwlock_node_irqsave lock;
struct thread *thread;
@@ -578,11 +545,12 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
unsigned long pp;
int cpuid;
int ret = 0;
struct perf_ctrl_desc *pcd;
unsigned int mode = 0;
switch (packet->msg) {
case SCD_MSG_INIT_CHANNEL_ACKED:
dkprintf("SCD_MSG_INIT_CHANNEL_ACKED\n");
process_msg_init_acked(c, packet->arg);
ret = 0;
break;
@@ -598,30 +566,30 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
}
pckt.ref = packet->ref;
pckt.arg = packet->arg;
syscall_channel_send(c, &pckt);
syscall_channel_send(resp_channel, &pckt);
ret = 0;
break;
case SCD_MSG_SCHEDULE_PROCESS:
cpuid = obtain_clone_cpuid();
if(cpuid == -1){
thread = (struct thread *)packet->arg;
cpuid = obtain_clone_cpuid(&thread->cpu_set);
if (cpuid == -1) {
kprintf("No CPU available\n");
ret = -1;
break;
}
dkprintf("SCD_MSG_SCHEDULE_PROCESS: %lx\n", packet->arg);
thread = (struct thread *)packet->arg;
proc = thread->proc;
settid(thread, 0, cpuid, -1, 0, NULL);
dkprintf("SCD_MSG_SCHEDULE_PROCESS: %lx\n", packet->arg);
proc = thread->proc;
thread->tid = proc->pid;
proc->status = PS_RUNNING;
thread->status = PS_RUNNING;
chain_thread(thread);
chain_process(proc);
runq_add_thread(thread, cpuid);
//cpu_local_var(next) = (struct thread *)packet->arg;
ret = 0;
break;
@@ -655,15 +623,15 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
pckt.err = 0;
pckt.ref = packet->ref;
pckt.arg = packet->arg;
syscall_channel_send(c, &pckt);
syscall_channel_send(resp_channel, &pckt);
rc = do_kill(NULL, info.pid, info.tid, info.sig, &info.info, 0);
kprintf("SCD_MSG_SEND_SIGNAL: do_kill(pid=%d, tid=%d, sig=%d)=%d\n", info.pid, info.tid, info.sig, rc);
dkprintf("SCD_MSG_SEND_SIGNAL: do_kill(pid=%d, tid=%d, sig=%d)=%d\n", info.pid, info.tid, info.sig, rc);
ret = 0;
break;
case SCD_MSG_PROCFS_REQUEST:
process_procfs_request(packet->arg);
process_procfs_request(packet);
ret = 0;
break;
@@ -688,15 +656,61 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
ret = 0;
break;
case SCD_MSG_GET_CPU_MAPPING:
req_get_cpu_mapping(packet->arg);
case SCD_MSG_PERF_CTRL:
pp = ihk_mc_map_memory(NULL, packet->arg, sizeof(struct perf_ctrl_desc));
pcd = (struct perf_ctrl_desc *)ihk_mc_map_virtual(pp, 1, PTATTR_WRITABLE | PTATTR_ACTIVE);
pckt.msg = SCD_MSG_REPLY_GET_CPU_MAPPING;
switch (pcd->ctrl_type) {
case PERF_CTRL_SET:
if (!pcd->exclude_kernel) {
mode |= PERFCTR_KERNEL_MODE;
}
if (!pcd->exclude_user) {
mode |= PERFCTR_USER_MODE;
}
ihk_mc_perfctr_init_raw(pcd->target_cntr, pcd->config, mode);
ihk_mc_perfctr_stop(1 << pcd->target_cntr);
ihk_mc_perfctr_reset(pcd->target_cntr);
break;
case PERF_CTRL_ENABLE:
ihk_mc_perfctr_start(pcd->target_cntr_mask);
break;
case PERF_CTRL_DISABLE:
ihk_mc_perfctr_stop(pcd->target_cntr_mask);
break;
case PERF_CTRL_GET:
pcd->read_value = ihk_mc_perfctr_read(pcd->target_cntr);
break;
default:
kprintf("%s: SCD_MSG_PERF_CTRL unexpected ctrl_type\n", __FUNCTION__);
}
ihk_mc_unmap_virtual(pcd, 1, 0);
ihk_mc_unmap_memory(NULL, pp, sizeof(struct perf_ctrl_desc));
pckt.msg = SCD_MSG_PERF_ACK;
pckt.err = 0;
pckt.arg = packet->arg;
syscall_channel_send(c, &pckt);
ihk_ikc_send(resp_channel, &pckt, 0);
ret = 0;
break;
case SCD_MSG_CPU_RW_REG:
pckt.msg = SCD_MSG_CPU_RW_REG_RESP;
memcpy(&pckt.desc, &packet->desc,
sizeof(struct ihk_os_cpu_register));
pckt.resp = packet->resp;
pckt.err = arch_cpu_read_write_register(&pckt.desc, packet->op);
ihk_ikc_send(resp_channel, &pckt, 0);
break;
default:
kprintf("syscall_pakcet_handler:unknown message "
"(%d.%d.%d.%d.%d.%#lx)\n",
@@ -711,56 +725,77 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
return ret;
}
void init_host_syscall_channel(void)
static int dummy_packet_handler(struct ihk_ikc_channel_desc *c,
void *__packet, void *__os)
{
struct ihk_ikc_connect_param param;
struct ikc_scd_packet pckt;
param.port = 501;
param.pkt_size = sizeof(struct ikc_scd_packet);
param.queue_size = PAGE_SIZE;
param.magic = 0x1129;
param.handler = syscall_packet_handler;
dkprintf("(syscall) Trying to connect host ...");
while (ihk_ikc_connect(NULL, &param) != 0) {
dkprintf(".");
ihk_mc_delay_us(1000 * 1000);
}
dkprintf("connected.\n");
get_this_cpu_local_var()->syscall_channel = param.channel;
process_msg_init(&cpu_local_var(iip), &cpu_local_var(scp));
pckt.msg = SCD_MSG_INIT_CHANNEL;
pckt.ref = ihk_mc_get_processor_id();
pckt.arg = virt_to_phys(&cpu_local_var(iip));
syscall_channel_send(param.channel, &pckt);
struct ikc_scd_packet *packet = __packet;
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet, c);
return 0;
}
void init_host_syscall_channel2(void)
void init_host_ikc2linux(int linux_cpu)
{
struct ihk_ikc_connect_param param;
struct ikc_scd_packet pckt;
struct ihk_ikc_channel_desc *c;
param.port = 502;
/* Main thread allocates channel pointer table */
if (!ikc2linuxs) {
ikc2linuxs = kmalloc(sizeof(*ikc2linuxs) *
ihk_mc_get_nr_linux_cores(), IHK_MC_AP_NOWAIT);
if (!ikc2linuxs) {
kprintf("%s: error: allocating Linux channels\n", __FUNCTION__);
panic("");
}
memset(ikc2linuxs, 0, sizeof(*ikc2linuxs) *
ihk_mc_get_nr_linux_cores());
}
c = ikc2linuxs[linux_cpu];
if (!c) {
param.port = 503;
param.intr_cpu = linux_cpu;
param.pkt_size = sizeof(struct ikc_scd_packet);
param.queue_size = 2 * num_processors * sizeof(struct ikc_scd_packet);
if (param.queue_size < PAGE_SIZE * 4) {
param.queue_size = PAGE_SIZE * 4;
}
param.magic = 0x1129;
param.handler = dummy_packet_handler;
dkprintf("(ikc2linux) Trying to connect host ...");
while (ihk_ikc_connect(NULL, &param) != 0) {
dkprintf(".");
ihk_mc_delay_us(1000 * 1000);
}
dkprintf("connected.\n");
ikc2linuxs[linux_cpu] = param.channel;
c = param.channel;
}
get_this_cpu_local_var()->ikc2linux = c;
}
void init_host_ikc2mckernel(void)
{
struct ihk_ikc_connect_param param;
param.port = 501;
param.intr_cpu = -1;
param.pkt_size = sizeof(struct ikc_scd_packet);
param.queue_size = PAGE_SIZE;
param.queue_size = PAGE_SIZE * 4;
param.magic = 0x1329;
param.handler = syscall_packet_handler;
dkprintf("(syscall) Trying to connect host ...");
dkprintf("(ikc2mckernel) Trying to connect host ...");
while (ihk_ikc_connect(NULL, &param) != 0) {
dkprintf(".");
ihk_mc_delay_us(1000 * 1000);
}
dkprintf("connected.\n");
get_this_cpu_local_var()->syscall_channel2 = param.channel;
process_msg_init(&cpu_local_var(iip2), &cpu_local_var(scp2));
pckt.msg = SCD_MSG_INIT_CHANNEL;
pckt.ref = ihk_mc_get_processor_id();
pckt.arg = virt_to_phys(&cpu_local_var(iip2));
syscall_channel_send(param.channel, &pckt);
ihk_ikc_set_regular_channel(NULL, param.channel, ihk_ikc_get_processor_id());
}

View File

@@ -38,6 +38,26 @@ extern ihk_spinlock_t cpu_status_lock;
#define CPU_FLAG_NEED_RESCHED 0x1U
#define CPU_FLAG_NEED_MIGRATE 0x2U
typedef int (*smp_func_t)(int cpu_index, int nr_cpus, void *arg);
int smp_call_func(cpu_set_t *__cpu_set, smp_func_t __func, void *__arg);
/*
 * Describes one function call of type smp_func_t together with its argument.
 * NOTE(review): presumably shared by all per-CPU requests created for a
 * single smp_call_func() invocation - confirm against the implementation.
 */
struct smp_func_call_data {
/* XXX: Sync MCS lock to avoid contention on counter */
// mcs_lock_node_t lock;
/* presumably the number of CPUs taking part in the call - TODO confirm */
int nr_cpus;
/* presumably counts CPUs that have not finished yet - TODO confirm */
ihk_atomic_t cpus_left;
/* Function to invoke and the opaque argument handed to it. */
smp_func_t func;
void *arg;
};
/*
 * One queued function-call request, pointing at the smp_func_call_data that
 * describes the call.
 * NOTE(review): presumably linked into a cpu_local_var smp_func_req_list via
 * 'list' - confirm against the implementation.
 */
struct smp_func_call_request {
struct smp_func_call_data *sfcd;
/* presumably the cpu_index value passed to the smp_func_t - TODO confirm */
int cpu_index;
/* presumably the return value of the called function - TODO confirm */
int ret;
struct list_head list;
};
struct cpu_local_var {
/* malloc */
struct list_head free_list;
@@ -55,13 +75,8 @@ struct cpu_local_var {
struct list_head runq;
size_t runq_len;
struct ihk_ikc_channel_desc *syscall_channel;
struct syscall_params scp;
struct ikc_scd_init_param iip;
struct ihk_ikc_channel_desc *ikc2linux;
struct ihk_ikc_channel_desc *syscall_channel2;
struct syscall_params scp2;
struct ikc_scd_init_param iip2;
struct resource_set *resource_set;
int status;
@@ -77,6 +92,10 @@ struct cpu_local_var {
int no_preempt;
int timer_enabled;
int kmalloc_initialized;
struct ihk_os_cpu_monitor *monitor;
ihk_spinlock_t smp_func_req_lock;
struct list_head smp_func_req_list;
} __attribute__((aligned(64)));

View File

@@ -16,7 +16,7 @@
extern void arch_init(void);
extern void kmsg_init(int);
extern void mem_init(void);
extern void ikc_master_init(void);
extern void ihk_ikc_master_init(void);
extern void ap_init(void);
extern void arch_ready(void);
extern void mc_ikc_test_init(void);
@@ -24,12 +24,18 @@ extern void cpu_local_var_init(void);
extern void kmalloc_init(void);
extern void ap_start(void);
extern void ihk_mc_dma_init(void);
extern void init_host_syscall_channel(void);
extern void init_host_syscall_channel2(void);
extern void init_host_ikc2linux(int linux_cpu);
extern void init_host_ikc2mckernel(void);
//extern void set_ikc2linux_to_local(int linux_cpu);
extern void sched_init(void);
extern void pc_ap_init(void);
extern void cpu_sysfs_setup(void);
extern void numa_sysfs_setup(void);
extern void rusage_sysfs_setup(void);
extern void status_sysfs_setup(void);
extern char *find_command_line(char *name);
extern int num_processors;
#endif

View File

@@ -28,9 +28,9 @@ r;\
})
#define kfree(ptr) _kfree(ptr, __FILE__, __LINE__)
#define memcheck(ptr, msg) _memcheck(ptr, msg, __FILE__, __LINE__, 0)
void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line);
void *_kmalloc(int size, ihk_mc_ap_flag flag, char *file, int line);
void _kfree(void *ptr, char *file, int line);
void *__kmalloc(int size, enum ihk_mc_ap_flag flag);
void *__kmalloc(int size, ihk_mc_ap_flag flag);
void __kfree(void *ptr);
int _memcheck(void *ptr, char *msg, char *file, int line, int free);

View File

@@ -32,13 +32,28 @@ enum {
MF_HAS_PAGER = 0x0001,
MF_SHMDT_OK = 0x0002,
MF_IS_REMOVABLE = 0x0004,
MF_PREFETCH = 0x0008,
MF_ZEROFILL = 0x0010,
MF_REG_FILE = 0x1000,
MF_DEV_FILE = 0x2000,
MF_PREMAP = 0x8000,
MF_HOST_RELEASED = 0x80000000,
MF_END
};
#define MEMOBJ_READY 0
#define MEMOBJ_TO_BE_PREFETCHED 1
struct memobj {
struct memobj_ops * ops;
uint32_t flags;
int8_t padding[4];
ihk_spinlock_t lock;
struct memobj_ops *ops;
uint32_t flags;
uint32_t status;
size_t size;
ihk_spinlock_t lock;
/* For pre-mapped memobjects */
void **pages;
int nr_pages;
};
typedef void memobj_release_func_t(struct memobj *obj);

View File

@@ -17,8 +17,9 @@
struct page {
struct list_head list;
struct list_head hash;
uint8_t mode;
uint8_t padding[3];
uint64_t phys;
ihk_atomic_t count;
off_t offset;
};
@@ -38,9 +39,8 @@ enum page_mode {
struct page *phys_to_page(uintptr_t phys);
uintptr_t page_to_phys(struct page *page);
int page_unmap(struct page *page);
struct page *phys_to_page_insert_hash(uint64_t phys);
void *allocate_pages(int npages, enum ihk_mc_ap_flag flag);
void free_pages(void *va, int npages);
void begin_free_pages_pending(void);
void finish_free_pages_pending(void);

View File

@@ -30,7 +30,8 @@ enum pager_op {
struct pager_create_result {
uintptr_t handle;
int maxprot;
int8_t padding[4];
uint32_t flags;
size_t size;
};
/*

View File

@@ -22,10 +22,13 @@
#include <memobj.h>
#include <affinity.h>
#include <syscall.h>
#include <bitops.h>
#include <profile.h>
#define VR_NONE 0x0
#define VR_STACK 0x1
#define VR_RESERVED 0x2
#define VR_AP_USER 0x4
#define VR_IO_NOCACHE 0x100
#define VR_REMOTE 0x200
#define VR_WRITE_COMBINED 0x400
@@ -165,6 +168,73 @@
#define NOPHYS ((uintptr_t)-1)
#define PROCESS_NUMA_MASK_BITS 256
/*
* Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
* passed by the user to either set_mempolicy() or mbind() in an 'int' actual.
* The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
*/
/* Policies */
/* Mempolicy modes; enumerators are numbered implicitly, starting at 0. */
enum {
MPOL_DEFAULT,
MPOL_PREFERRED,
MPOL_BIND,
MPOL_INTERLEAVE,
MPOL_LOCAL,
MPOL_MAX, /* always last member of enum */
};
enum mpol_rebind_step {
MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */
MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */
MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/
MPOL_REBIND_NSTEP,
};
/* Flags for set_mempolicy */
#define MPOL_F_STATIC_NODES (1 << 15)
#define MPOL_F_RELATIVE_NODES (1 << 14)
/*
* MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
* either set_mempolicy() or mbind().
*/
#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
/* Flags for get_mempolicy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
#define MPOL_F_ADDR (1<<1) /* look up vma using address */
#define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
/* Flags for mbind */
#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform
to policy */
#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */
#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */
#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */
#define MPOL_MF_VALID (MPOL_MF_STRICT | \
MPOL_MF_MOVE | \
MPOL_MF_MOVE_ALL)
/*
* Internal flags that share the struct mempolicy flags word with
* "mode flags". These flags are allocated from bit 0 up, as they
* are never OR'ed into the mode in mempolicy API arguments.
*/
#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */
#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
#define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa Reference On Node */
#define SPAWN_TO_LOCAL 0
#define SPAWN_TO_REMOTE 1
#define SPAWNING_TO_REMOTE 1001
#include <waitq.h>
#include <futex.h>
@@ -178,6 +248,7 @@ struct process_vm;
struct vm_regions;
struct vm_range;
#define HASH_SIZE 73
struct resource_set {
@@ -303,13 +374,21 @@ struct vm_range {
off_t objoff;
int pgshift; /* page size. 0 means THP */
int padding;
void *private_data;
};
/*
 * NUMA memory policy attached to an address range.
 * NOTE(review): presumably linked into process_vm.vm_range_numa_policy_list
 * through 'list' - confirm.
 */
struct vm_range_numa_policy {
struct list_head list;
/* Bounds of the covered range - assumes [start, end), TODO confirm */
unsigned long start, end;
/* Bitmap of PROCESS_NUMA_MASK_BITS bits - presumably one bit per NUMA node */
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
/* presumably one of the MPOL_* policy modes - TODO confirm */
int numa_mem_policy;
};
struct vm_regions {
unsigned long vm_start, vm_end;
unsigned long text_start, text_end;
unsigned long data_start, data_end;
unsigned long brk_start, brk_end;
unsigned long brk_start, brk_end, brk_end_allocated;
unsigned long map_start, map_end;
unsigned long stack_start, stack_end;
unsigned long user_start, user_end;
@@ -334,7 +413,7 @@ struct mckfd {
#define SFD_NONBLOCK 04000
struct sig_common {
ihk_spinlock_t lock;
mcs_rwlock_lock_t lock;
ihk_atomic_t use;
struct k_sigaction action[_NSIG];
struct list_head sigpending;
@@ -395,7 +474,7 @@ struct process {
// V +---- |
// PS_STOPPED -----+
// (PS_TRACED)
int exit_status;
int exit_status; // only for zombie
/* Store exit_status for a group of threads when stopped by SIGSTOP.
exit_status can't be used because values of exit_status of threads
@@ -425,6 +504,7 @@ struct process {
unsigned long saved_auxv[AUXV_LEN];
char *saved_cmdline;
long saved_cmdline_len;
cpu_set_t cpu_set;
/* Store ptrace flags.
* The lower 8 bits are PTRACE_O_xxx of the PTRACE_SETOPTIONS request.
@@ -458,6 +538,10 @@ struct process {
long maxrss;
long maxrss_children;
/* Memory policy flags and memory specific options */
unsigned long mpol_flags;
size_t mpol_threshold;
unsigned long heap_extension;
// perf_event
int perf_status;
@@ -466,6 +550,13 @@ struct process {
#define PP_COUNT 2
#define PP_STOP 3
struct mc_perf_event *monitoring_event;
#ifdef PROFILE_ENABLE
int profile;
mcs_lock_node_t profile_lock;
struct profile_event *profile_events;
unsigned long profile_elapsed_ts;
#endif // PROFILE_ENABLE
int nr_processes; /* For partitioned execution */
};
void hold_thread(struct thread *ftn);
@@ -507,6 +598,7 @@ struct thread {
// PS_TRACED
// PS_INTERRUPTIBLE
// PS_UNINTERRUPTIBLE
int exit_status;
// process vm
struct process_vm *vm;
@@ -537,12 +629,19 @@ struct thread {
fp_regs_struct *fp_regs;
int in_syscall_offload;
#ifdef PROFILE_ENABLE
int profile;
struct profile_event *profile_events;
unsigned long profile_start_ts;
unsigned long profile_elapsed_ts;
#endif // PROFILE_ENABLE
// signal
struct sig_common *sigcommon;
sigset_t sigmask;
stack_t sigstack;
struct list_head sigpending;
ihk_spinlock_t sigpendinglock;
mcs_rwlock_lock_t sigpendinglock;
volatile int sigevent;
// gpio
@@ -555,9 +654,14 @@ struct thread {
struct sig_pending *ptrace_sendsig;
// cpu time
/*
struct timespec stime;
struct timespec utime;
struct timespec btime;
*/
unsigned long system_tsc;
unsigned long user_tsc;
unsigned long base_tsc;
int times_update;
int in_kernel;
@@ -570,8 +674,15 @@ struct thread {
/* Syscall offload wait queue head */
struct waitq scd_wq;
int thread_offloaded;
int mod_clone;
struct uti_attr *mod_clone_arg;
int parent_cpuid;
};
#define VM_RANGE_CACHE_SIZE 4
struct process_vm {
struct address_space *address_space;
struct list_head vm_range_list;
@@ -594,6 +705,12 @@ struct process_vm {
int exiting;
long currss;
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
int numa_mem_policy;
/* Protected by memory_range_lock */
struct list_head vm_range_numa_policy_list;
struct vm_range *range_cache[VM_RANGE_CACHE_SIZE];
int range_cache_ind;
};
static inline int has_cap_ipc_lock(struct thread *th)
@@ -610,7 +727,8 @@ static inline int has_cap_sys_admin(struct thread *th)
void hold_address_space(struct address_space *);
void release_address_space(struct address_space *);
struct thread *create_thread(unsigned long user_pc);
struct thread *create_thread(unsigned long user_pc,
unsigned long *__cpu_set, size_t cpu_set_size);
struct thread *clone_thread(struct thread *org, unsigned long pc,
unsigned long sp, int clone_flags);
void destroy_thread(struct thread *thread);
@@ -625,9 +743,10 @@ void free_process_memory_ranges(struct process_vm *vm);
int populate_process_memory(struct process_vm *vm, void *start, size_t len);
int add_process_memory_range(struct process_vm *vm,
unsigned long start, unsigned long end,
unsigned long phys, unsigned long flag,
struct memobj *memobj, off_t objoff, int pgshift);
unsigned long start, unsigned long end,
unsigned long phys, unsigned long flag,
struct memobj *memobj, off_t offset,
int pgshift, struct vm_range **rp);
int remove_process_memory_range(struct process_vm *vm, unsigned long start,
unsigned long end, int *ro_freedp);
int split_process_memory_range(struct process_vm *vm,
@@ -661,15 +780,17 @@ int init_process_stack(struct thread *thread, struct program_load_desc *pn,
int argc, char **argv,
int envc, char **env);
unsigned long extend_process_region(struct process_vm *vm,
unsigned long start, unsigned long end,
unsigned long address, unsigned long flag);
unsigned long end_allocated,
unsigned long address, unsigned long flag);
extern enum ihk_mc_pt_attribute arch_vrflag_to_ptattr(unsigned long flag, uint64_t fault, pte_t *ptep);
enum ihk_mc_pt_attribute common_vrflag_to_ptattr(unsigned long flag, uint64_t fault, pte_t *ptep);
void schedule(void);
void spin_sleep_or_schedule(void);
void runq_add_thread(struct thread *thread, int cpu_id);
void runq_del_thread(struct thread *thread, int cpu_id);
int sched_wakeup_thread(struct thread *thread, int valid_states);
int sched_wakeup_thread_locked(struct thread *thread, int valid_states);
void sched_request_migrate(int cpu_id, struct thread *thread);
void check_need_resched(void);
@@ -690,7 +811,5 @@ void chain_thread(struct thread *);
void proc_init();
void set_timer();
struct sig_pending *hassigpending(struct thread *thread);
void settid(struct thread *thread, int mode, int newcpuid, int oldcpuid,
int nr_tids, int *tids);
#endif

66
kernel/include/profile.h Normal file
View File

@@ -0,0 +1,66 @@
#ifndef __PROCESS_PROFILE_H_
#define __PROCESS_PROFILE_H_
/* Profiling is compiled in while PROFILE_ENABLE is defined; comment out the define below to disable it */
#define PROFILE_ENABLE
#ifdef PROFILE_ENABLE
#define PROFILE_SYSCALL_MAX 300
#define PROFILE_OFFLOAD_MAX (PROFILE_SYSCALL_MAX << 1)
#define PROFILE_EVENT_MIN PROFILE_OFFLOAD_MAX
#define __NR_profile 701
#define PROF_JOB 0x40000000
#define PROF_PROC 0x80000000
#define PROF_CLEAR 0x01
#define PROF_ON 0x02
#define PROF_OFF 0x04
#define PROF_PRINT 0x08
/*
 * Accumulator for a single profiled event type.
 * NOTE(review): field meanings are inferred from their names and from
 * profile_event_add(type, tsc) - confirm against profile.c.
 */
struct profile_event {
/* presumably the number of recorded occurrences */
uint32_t cnt;
/* presumably the accumulated TSC value for this event */
uint64_t tsc;
};
/*
* The layout of profile events is as follows:
* [0,PROFILE_SYSCALL_MAX) - syscalls
* [PROFILE_SYSCALL_MAX,PROFILE_OFFLOAD_MAX) - syscall offloads
* [PROFILE_OFFLOAD_MAX,PROFILE_EVENT_MAX) - general events
*
* XXX: Make sure to fill in prof_event_names in profile.c
* for each added profiled event.
*/
/*
 * Identifiers of the general profile events. The first enumerator is pinned
 * to PROFILE_EVENT_MIN (defined as PROFILE_OFFLOAD_MAX); the rest follow
 * sequentially.
 */
enum profile_event_type {
PROFILE_tlb_invalidate = PROFILE_EVENT_MIN,
PROFILE_page_fault,
PROFILE_page_fault_anon_clr,
PROFILE_page_fault_file,
PROFILE_page_fault_dev_file,
PROFILE_page_fault_file_clr,
PROFILE_mpol_alloc_missed,
PROFILE_mmap_anon_contig_phys,
PROFILE_mmap_anon_no_contig_phys,
PROFILE_mmap_regular_file,
PROFILE_mmap_device_file,
PROFILE_EVENT_MAX /* Should be the last event type */
};
struct thread;
struct process;
enum profile_event_type profile_syscall2offload(enum profile_event_type sc);
void profile_event_add(enum profile_event_type type, uint64_t tsc);
void profile_print_thread_stats(struct thread *thread);
void profile_print_proc_stats(struct process *proc);
void profile_print_job_stats(struct process *proc);
void profile_accumulate_events(struct thread *thread, struct process *proc);
int profile_accumulate_and_print_job_events(struct process *proc);
int profile_alloc_events(struct thread *thread);
void profile_dealloc_thread_events(struct thread *thread);
void profile_dealloc_proc_events(struct process *proc);
#endif // PROFILE_ENABLE
#endif // __PROCESS_PROFILE_H_

109
kernel/include/rbtree.h Normal file
View File

@@ -0,0 +1,109 @@
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
linux/include/linux/rbtree.h
To use rbtrees you'll have to implement your own insert and search cores.
This lets us avoid callbacks, which would dramatically reduce performance.
I know it's not the cleaner way, but in C (not in C++) to get
performances and genericity...
See Documentation/rbtree.txt for documentation and samples.
*/
#ifndef _LINUX_RBTREE_H
#define _LINUX_RBTREE_H
#include <ihk/types.h>
#include <lwk/compiler.h>
#include <lwk/stddef.h>
/*
 * Tree node, meant to be embedded in the structure being indexed (use
 * rb_entry() to get back to the container).
 */
struct rb_node {
/* Parent pointer combined with flag bits in the low two bits (see rb_parent()) */
unsigned long __rb_parent_color;
struct rb_node *rb_right;
struct rb_node *rb_left;
} __attribute__((aligned(sizeof(long))));
/* The alignment might seem pointless, but allegedly CRIS needs it */
/* Handle for a whole tree; rb_node is NULL while the tree is empty (see RB_EMPTY_ROOT). */
struct rb_root {
struct rb_node *rb_node;
};
/* Parent of r: __rb_parent_color with its two low bits masked off. */
#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3))
/* Value of an empty tree (root pointer NULL). */
#define RB_ROOT (struct rb_root) { NULL, }
/* Map a struct rb_node pointer back to its containing 'type' via container_of(). */
#define rb_entry(ptr, type, member) container_of(ptr, type, member)
/* True when the tree has no root node. */
#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL)
/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
/* A node is marked empty by storing its own address in __rb_parent_color. */
#define RB_EMPTY_NODE(node) \
((node)->__rb_parent_color == (unsigned long)(node))
#define RB_CLEAR_NODE(node) \
((node)->__rb_parent_color = (unsigned long)(node))
extern void rb_insert_color(struct rb_node *, struct rb_root *);
extern void rb_erase(struct rb_node *, struct rb_root *);
/* Find logical next and previous nodes in a tree */
extern struct rb_node *rb_next(const struct rb_node *);
extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);
/* Postorder iteration - always visit the parent after its children */
extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);
/* Fast replacement of a single node without remove/rebalance/add/rebalance */
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
struct rb_root *root);
/*
 * Attach a fresh node at the position *rb_link beneath parent.
 *
 * The node's child pointers are cleared, its __rb_parent_color is set to
 * the bare parent address (low bits clear), and the link slot is pointed
 * at the node. No rebalancing happens here.
 */
static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
				struct rb_node ** rb_link)
{
	/* A newly linked node has no children yet. */
	node->rb_right = NULL;
	node->rb_left = NULL;

	node->__rb_parent_color = (unsigned long)parent;

	/* Publish the node in the slot chosen by the caller. */
	*rb_link = node;
}
/*
 * Like rb_entry(), but yields NULL when ptr is NULL. ptr is evaluated only
 * once: it is captured in a local inside a statement expression.
 */
#define rb_entry_safe(ptr, type, member) \
({ typeof(ptr) ____ptr = (ptr); \
____ptr ? rb_entry(____ptr, type, member) : NULL; \
})
/**
* rbtree_postorder_for_each_entry_safe - iterate over rb_root in post order of
* given type safe against removal of rb_node entry
*
* @pos: the 'type *' to use as a loop cursor.
* @n: another 'type *' to use as temporary storage
* @root: 'rb_root *' of the rbtree.
* @field: the name of the rb_node field within 'type'.
*/
#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
typeof(*pos), field); 1; }); \
pos = n)
#endif /* _LINUX_RBTREE_H */

View File

@@ -0,0 +1,231 @@
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
(C) 2002 David Woodhouse <dwmw2@infradead.org>
(C) 2012 Michel Lespinasse <walken@google.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
linux/include/linux/rbtree_augmented.h
*/
#ifndef _LINUX_RBTREE_AUGMENTED_H
#define _LINUX_RBTREE_AUGMENTED_H
#include <rbtree.h>
/*
* Please note - only struct rb_augment_callbacks and the prototypes for
* rb_insert_augmented() and rb_erase_augmented() are intended to be public.
* The rest are implementation details you are not expected to depend on.
*
* See Documentation/rbtree.txt for documentation and samples.
*/
/*
 * Callback set used by the augmented insert/erase helpers to keep per-node
 * augmented data up to date. RB_DECLARE_CALLBACKS() generates a matching
 * instance.
 */
struct rb_augment_callbacks {
/* Recompute augmented data starting at node, walking up until stop is reached */
void (*propagate)(struct rb_node *node, struct rb_node *stop);
/* Copy the augmented data of old into new */
void (*copy)(struct rb_node *old, struct rb_node *new);
/* Fix up augmented data when new takes the place of old in a rotation */
void (*rotate)(struct rb_node *old, struct rb_node *new);
};
extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
/*
 * Insert node into root while maintaining augmented data.
 *
 * Thin wrapper: only the rotate callback of augment is forwarded to
 * __rb_insert_augmented().
 */
static inline void
rb_insert_augmented(struct rb_node *node, struct rb_root *root,
		    const struct rb_augment_callbacks *augment)
{
	void (*on_rotate)(struct rb_node *old, struct rb_node *new) =
		augment->rotate;

	__rb_insert_augmented(node, root, on_rotate);
}
/*
 * Generate an rb_augment_callbacks instance named 'rbname' plus the three
 * functions it points to.
 *
 * rbstatic:    storage-class prefix placed before the generated instance
 * rbname:      name of the instance; also the prefix of the helper functions
 * rbstruct:    type of the structure embedding the rb_node
 * rbfield:     name of the rb_node member inside rbstruct
 * rbtype:      type of the augmented value
 * rbaugmented: name of the rbstruct member holding the augmented value
 * rbcompute:   function or macro computing the augmented value of a node
 *
 * Generated helpers:
 *   rbname_propagate - walks from rb towards the root until 'stop',
 *                      storing rbcompute() in each node; ends early at the
 *                      first node whose stored value is already up to date.
 *   rbname_copy      - copies the augmented value from old to new.
 *   rbname_rotate    - new inherits old's augmented value, then old's value
 *                      is recomputed.
 */
#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \
rbtype, rbaugmented, rbcompute) \
static inline void \
rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \
{ \
while (rb != stop) { \
rbstruct *node = rb_entry(rb, rbstruct, rbfield); \
rbtype augmented = rbcompute(node); \
if (node->rbaugmented == augmented) \
break; \
node->rbaugmented = augmented; \
rb = rb_parent(&node->rbfield); \
} \
} \
static inline void \
rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \
{ \
rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \
rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \
new->rbaugmented = old->rbaugmented; \
} \
static void \
rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \
{ \
rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \
rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \
new->rbaugmented = old->rbaugmented; \
old->rbaugmented = rbcompute(old); \
} \
rbstatic const struct rb_augment_callbacks rbname = { \
rbname ## _propagate, rbname ## _copy, rbname ## _rotate \
};
/* Node colors, as encoded in bit 0 of a __rb_parent_color word. */
#define RB_RED 0
#define RB_BLACK 1

/*
 * Helpers operating on a raw __rb_parent_color word 'pc'.
 *
 * __rb_parent() strips the two low bits and yields the parent pointer;
 * __rb_color() yields bit 0. The argument is parenthesized in every
 * expansion so that an expression argument (e.g. "a | b") is masked as a
 * whole rather than being regrouped by operator precedence.
 */
#define __rb_parent(pc) ((struct rb_node *)((pc) & ~3))
#define __rb_color(pc) ((pc) & 1)
#define __rb_is_black(pc) __rb_color(pc)
#define __rb_is_red(pc) (!__rb_color(pc))

/* Same queries, taking a node pointer instead of a raw word. */
#define rb_color(rb) __rb_color((rb)->__rb_parent_color)
#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color)
#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color)
/*
 * Point rb at a new parent p while keeping rb's current colour bit.
 */
static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
{
	unsigned long color_bit = rb_color(rb);
	unsigned long parent_bits = (unsigned long)p;

	rb->__rb_parent_color = color_bit | parent_bits;
}
/*
 * Overwrite rb's combined parent/colour word with parent p and the given
 * colour value.
 */
static inline void rb_set_parent_color(struct rb_node *rb,
				       struct rb_node *p, int color)
{
	unsigned long packed = (unsigned long)p;

	packed |= color;
	rb->__rb_parent_color = packed;
}
/*
 * Replace the link that currently leads to old with new.
 * With a parent, that is parent->rb_left when it equals old, otherwise
 * parent->rb_right; without a parent it is the root pointer itself.
 */
static inline void
__rb_change_child(struct rb_node *old, struct rb_node *new,
		  struct rb_node *parent, struct rb_root *root)
{
	struct rb_node **slot;

	if (!parent)
		slot = &root->rb_node;
	else if (parent->rb_left == old)
		slot = &parent->rb_left;
	else
		slot = &parent->rb_right;

	*slot = new;
}
extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
/*
 * Unlink node from the tree rooted at root.
 *
 * The node's child/parent links are spliced out (directly when it has at
 * most one child, otherwise by moving its in-order successor into its
 * place), augment->copy and augment->propagate are invoked so the
 * augmented values follow the new shape, and colours are adjusted locally
 * where that is enough.
 *
 * Returns the node from which __rb_erase_color() has to rebalance, or NULL
 * when no rebalancing is required (see rb_erase_augmented()).
 */
static __always_inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
struct rb_node *child = node->rb_right, *tmp = node->rb_left;
struct rb_node *parent, *rebalance;
unsigned long pc;
if (!tmp) {
/*
* Case 1: node to erase has no more than 1 child (easy!)
*
* Note that if there is one child it must be red due to 5)
* and node must be black due to 4). We adjust colors locally
* so as to bypass __rb_erase_color() later on.
*/
pc = node->__rb_parent_color;
parent = __rb_parent(pc);
__rb_change_child(node, child, parent, root);
if (child) {
/* the child inherits node's parent and colour in one store */
child->__rb_parent_color = pc;
rebalance = NULL;
} else
rebalance = __rb_is_black(pc) ? parent : NULL;
tmp = parent;
} else if (!child) {
/* Still case 1, but this time the child is node->rb_left */
tmp->__rb_parent_color = pc = node->__rb_parent_color;
parent = __rb_parent(pc);
__rb_change_child(node, tmp, parent, root);
rebalance = NULL;
tmp = parent;
} else {
struct rb_node *successor = child, *child2;
tmp = child->rb_left;
if (!tmp) {
/*
* Case 2: node's successor is its right child
*
*    (n)          (s)
*    / \          / \
*  (x) (s)  ->  (x) (c)
*        \
*        (c)
*/
parent = successor;
child2 = successor->rb_right;
augment->copy(node, successor);
} else {
/*
* Case 3: node's successor is leftmost under
* node's right child subtree
*
*    (n)          (s)
*    / \          / \
*  (x) (y)  ->  (x) (y)
*      /            /
*    (p)          (p)
*    /            /
*  (s)          (c)
*    \
*    (c)
*/
/* descend to the leftmost node; parent trails one step behind */
do {
parent = successor;
successor = tmp;
tmp = tmp->rb_left;
} while (tmp);
parent->rb_left = child2 = successor->rb_right;
successor->rb_right = child;
rb_set_parent(child, successor);
augment->copy(node, successor);
augment->propagate(parent, successor);
}
/* common tail of cases 2 and 3: successor adopts node's left subtree and node's place under its parent */
successor->rb_left = tmp = node->rb_left;
rb_set_parent(tmp, successor);
pc = node->__rb_parent_color;
tmp = __rb_parent(pc);
__rb_change_child(node, successor, tmp, root);
if (child2) {
successor->__rb_parent_color = pc;
rb_set_parent_color(child2, parent, RB_BLACK);
rebalance = NULL;
} else {
/* remember the successor's old colour before it takes over node's */
unsigned long pc2 = successor->__rb_parent_color;
successor->__rb_parent_color = pc;
rebalance = __rb_is_black(pc2) ? parent : NULL;
}
tmp = successor;
}
/* tmp is the erased node's former parent (case 1) or the successor (cases 2/3); propagate upwards from there with no stop node */
augment->propagate(tmp, NULL);
return rebalance;
}
/*
 * Erase node from the tree: unlink it with __rb_erase_augmented() and, when
 * that reports a node to rebalance from, run __rb_erase_color() on it with
 * the caller's rotate callback.
 */
static __always_inline void
rb_erase_augmented(struct rb_node *node, struct rb_root *root,
		   const struct rb_augment_callbacks *augment)
{
	struct rb_node *fixup_from = __rb_erase_augmented(node, root, augment);

	if (!fixup_from)
		return;

	__rb_erase_color(fixup_from, root, augment->rotate);
}
#endif /* _LINUX_RBTREE_AUGMENTED_H */

197
kernel/include/rusage.h Normal file
View File

@@ -0,0 +1,197 @@
#ifndef __RUSAGE_H
#define __RUSAGE_H
#include <config.h>
#include <ihk/rusage.h>
#ifdef ENABLE_RUSAGE
#define RUSAGE_MEM_LIMIT (2 * 1024 * 1024) // 2MB
extern void eventfd();
/*
 * Add size to monitor->rusage_total_memory.
 * NOTE(review): unlike the other counters in this header this update is a
 * plain read-modify-write, not an atomic builtin -- presumably it is only
 * called where no concurrent update can happen; confirm.
 */
static inline void
rusage_total_memory_add(unsigned long size)
{
	monitor->rusage_total_memory = monitor->rusage_total_memory + size;
}
/*
 * Atomically add size to monitor->rusage_rss_current and raise
 * monitor->rusage_rss_max to the new value if it is larger.
 */
static inline void
rusage_rss_add(unsigned long size)
{
	unsigned long usage, peak, witnessed;

	usage = __sync_add_and_fetch(&monitor->rusage_rss_current, size);

	/*
	 * Compare-and-swap loop: retry with the value another CPU installed
	 * until either our value is stored or the recorded peak is already
	 * at least as large.
	 */
	for (peak = monitor->rusage_rss_max; usage > peak; peak = witnessed) {
		witnessed = __sync_val_compare_and_swap(
				&monitor->rusage_rss_max, peak, usage);
		if (witnessed == peak)
			break;
	}
}
/*
 * Atomically subtract size from monitor->rusage_rss_current.
 */
static inline void
rusage_rss_sub(unsigned long size)
{
	(void)__sync_fetch_and_sub(&monitor->rusage_rss_current, size);
}
/*
 * Atomically add size to monitor->rusage_kmem_usage and raise
 * monitor->rusage_kmem_max_usage to the new value if it is larger.
 */
static inline void
rusage_kmem_add(unsigned long size)
{
	unsigned long usage, peak, witnessed;

	usage = __sync_add_and_fetch(&monitor->rusage_kmem_usage, size);

	/* retry the peak update until it sticks or is no longer needed */
	for (peak = monitor->rusage_kmem_max_usage; usage > peak;
	     peak = witnessed) {
		witnessed = __sync_val_compare_and_swap(
				&monitor->rusage_kmem_max_usage,
				peak, usage);
		if (witnessed == peak)
			break;
	}
}
/*
 * Atomically subtract size from monitor->rusage_kmem_usage.
 */
static inline void
rusage_kmem_sub(unsigned long size)
{
	(void)__sync_fetch_and_sub(&monitor->rusage_kmem_usage, size);
}
/*
 * Account size bytes to the per-NUMA counter selected by numa_id, then to
 * the RSS counters via rusage_rss_add().
 */
static inline void
rusage_numa_add(int numa_id, unsigned long size)
{
	(void)__sync_add_and_fetch(&monitor->rusage_numa_stat[numa_id], size);
	rusage_rss_add(size);
}
/*
 * Reverse of rusage_numa_add(): drop size bytes from the RSS counter first,
 * then from the per-NUMA counter selected by numa_id.
 */
static inline void
rusage_numa_sub(int numa_id, unsigned long size)
{
	rusage_rss_sub(size);
	(void)__sync_sub_and_fetch(&monitor->rusage_numa_stat[numa_id], size);
}
/*
 * Account an allocation of "pages" pages.
 *
 * The byte count goes to the NUMA/RSS counters when is_user is non-zero and
 * to the kernel-memory counters otherwise, and in both cases to
 * monitor->rusage_total_memory_usage.  When this call establishes a new
 * total-usage peak and the distance between monitor->rusage_total_memory
 * and the new usage is below RUSAGE_MEM_LIMIT, eventfd() is called.
 */
static inline void
rusage_page_add(int numa_id, unsigned long pages, int is_user)
{
	const unsigned long bytes = pages * PAGE_SIZE;
	unsigned long usage, peak, witnessed;

	if (!is_user)
		rusage_kmem_add(bytes);
	else
		rusage_numa_add(numa_id, bytes);

	usage = __sync_add_and_fetch(&monitor->rusage_total_memory_usage,
				     bytes);

	for (peak = monitor->rusage_total_memory_max_usage; usage > peak;
	     peak = witnessed) {
		witnessed = __sync_val_compare_and_swap(
				&monitor->rusage_total_memory_max_usage,
				peak, usage);
		if (witnessed != peak)
			continue;	/* lost the race: retry against the newer peak */

		/*
		 * We installed the new peak.
		 * NOTE(review): the subtraction is unsigned, so if usage is
		 * larger than rusage_total_memory it wraps to a huge value
		 * and no notification is sent -- confirm this is intended.
		 */
		if (monitor->rusage_total_memory - usage < RUSAGE_MEM_LIMIT)
			eventfd();
		break;
	}
}
/*
 * Account the release of "pages" pages: subtract the byte count from
 * monitor->rusage_total_memory_usage, then from the NUMA/RSS counters
 * (is_user non-zero) or the kernel-memory counter (is_user zero).
 */
static inline void
rusage_page_sub(int numa_id, unsigned long pages, int is_user)
{
	const unsigned long bytes = pages * PAGE_SIZE;

	(void)__sync_fetch_and_sub(&monitor->rusage_total_memory_usage, bytes);

	if (!is_user) {
		rusage_kmem_sub(bytes);
		return;
	}
	rusage_numa_sub(numa_id, bytes);
}
/*
 * Atomically increment monitor->rusage_num_threads and raise
 * monitor->rusage_max_num_threads to the new count if it is larger.
 */
static inline void
rusage_num_threads_inc()
{
	unsigned long count, peak, witnessed;

	count = __sync_add_and_fetch(&monitor->rusage_num_threads, 1);

	/* retry the peak update until it sticks or is no longer needed */
	for (peak = monitor->rusage_max_num_threads; count > peak;
	     peak = witnessed) {
		witnessed = __sync_val_compare_and_swap(
				&monitor->rusage_max_num_threads,
				peak, count);
		if (witnessed == peak)
			break;
	}
}
/*
 * Atomically decrement monitor->rusage_num_threads.
 */
static inline void
rusage_num_threads_dec()
{
	(void)__sync_fetch_and_sub(&monitor->rusage_num_threads, 1);
}
#else
/*
 * ENABLE_RUSAGE is not defined: every accounting hook becomes an empty
 * inline so call sites need no #ifdef of their own.  The set of stubs
 * mirrors the functions of the ENABLE_RUSAGE branch one-to-one, including
 * rusage_kmem_add()/rusage_kmem_sub().
 */
static inline void
rusage_total_memory_add(unsigned long size)
{
}
static inline void
rusage_rss_add(unsigned long size)
{
}
static inline void
rusage_rss_sub(unsigned long size)
{
}
static inline void
rusage_kmem_add(unsigned long size)
{
}
static inline void
rusage_kmem_sub(unsigned long size)
{
}
static inline void
rusage_numa_add(int numa_id, unsigned long size)
{
}
static inline void
rusage_numa_sub(int numa_id, unsigned long size)
{
}
static inline void
rusage_page_add(int numa_id, unsigned long pages, int is_user)
{
}
static inline void
rusage_page_sub(int numa_id, unsigned long pages, int is_user)
{
}
static inline void
rusage_num_threads_inc()
{
}
static inline void
rusage_num_threads_dec()
{
}
#endif // ENABLE_RUSAGE
#endif

View File

@@ -73,6 +73,13 @@
/* #define SCD_MSG_SYSFS_RESP_CLEANUP 0x43 */
#define SCD_MSG_PROCFS_TID_CREATE 0x44
#define SCD_MSG_PROCFS_TID_DELETE 0x45
#define SCD_MSG_EVENTFD 0x46
#define SCD_MSG_PERF_CTRL 0x50
#define SCD_MSG_PERF_ACK 0x51
#define SCD_MSG_CPU_RW_REG 0x52
#define SCD_MSG_CPU_RW_REG_RESP 0x53
/* Cloning flags. */
# define CSIGNAL 0x000000ff /* Signal mask to be sent at exit. */
@@ -149,6 +156,15 @@ struct program_image_section {
#define MCK_RLIMIT_SIGPENDING 14
#define MCK_RLIMIT_STACK 15
#define PLD_CPU_SET_MAX_CPUS 1024
typedef unsigned long __cpu_set_unit;
#define PLD_CPU_SET_SIZE (PLD_CPU_SET_MAX_CPUS / (8 * sizeof(__cpu_set_unit)))
#define MPOL_NO_HEAP 0x01
#define MPOL_NO_STACK 0x02
#define MPOL_NO_BSS 0x04
#define MPOL_SHM_PREMAP 0x08
struct program_load_desc {
int num_sections;
int status;
@@ -177,7 +193,13 @@ struct program_load_desc {
unsigned long envs_len;
struct rlimit rlimit[MCK_RLIM_MAX];
unsigned long interp_align;
unsigned long mpol_flags;
unsigned long mpol_threshold;
unsigned long heap_extension;
int nr_processes;
char shell_path[SHELL_PATH_MAX_LEN];
__cpu_set_unit cpu_set[PLD_CPU_SET_SIZE];
int profile;
struct program_image_section sections[0];
};
@@ -201,6 +223,18 @@ struct syscall_request {
unsigned long args[6];
};
struct ihk_os_cpu_register {
unsigned long addr;
unsigned long val;
unsigned long addr_ext;
};
enum mcctrl_os_cpu_operation {
MCCTRL_OS_CPU_READ_REGISTER,
MCCTRL_OS_CPU_WRITE_REGISTER,
MCCTRL_OS_CPU_MAX_OP
};
struct ikc_scd_packet {
int msg;
int err;
@@ -226,6 +260,13 @@ struct ikc_scd_packet {
struct {
int ttid;
};
/* SCD_MSG_CPU_RW_REG */
struct {
struct ihk_os_cpu_register desc;
enum mcctrl_os_cpu_operation op;
void *resp;
};
};
char padding[12];
};
@@ -250,22 +291,6 @@ struct syscall_post {
unsigned long v[8];
};
struct syscall_params {
unsigned long request_rpa, request_pa;
struct syscall_request *request_va;
unsigned long response_pa;
struct syscall_response *response_va;
unsigned long doorbell_rpa, doorbell_pa;
unsigned long *doorbell_va;
unsigned int post_idx;
unsigned long post_rpa, post_pa;
struct syscall_post *post_va;
unsigned long post_fin;
struct syscall_post post_buf IHK_DMA_ALIGN;
};
#define SYSCALL_DECLARE(name) long sys_##name(int n, ihk_mc_user_context_t *ctx)
#define SYSCALL_HEADER struct syscall_request request IHK_DMA_ALIGN; \
request.number = n
@@ -331,7 +356,7 @@ void delete_proc_procfs_files(int pid);
void create_os_procfs_files(void);
void delete_os_procfs_files(void);
#define PROCFS_NAME_MAX 1000
#define PROCFS_NAME_MAX 768
struct procfs_read {
unsigned long pbuf; /* physical address of the host buffer (request) */
@@ -385,8 +410,37 @@ struct tod_data_s {
};
extern struct tod_data_s tod_data; /* residing in arch-dependent file */
/*
 * Convert a cycle count into a timespec, using tod_data.clocks_per_sec as
 * the number of cycles per second.
 */
static inline void tsc_to_ts(unsigned long tsc, struct timespec *ts)
{
	/* whole seconds contained in the cycle count */
	time_t whole_sec = tsc / tod_data.clocks_per_sec;
	/*
	 * leftover cycles scaled to nanoseconds; this product overflows if
	 * clocks_per_sec exceeds 18.44 GHz
	 */
	long frac_ns = NS_PER_SEC * (tsc % tod_data.clocks_per_sec)
			/ tod_data.clocks_per_sec;

	ts->tv_sec = whole_sec;
	ts->tv_nsec = frac_ns;

	/* carry a full second out of the nanosecond field if needed */
	if (ts->tv_nsec < NS_PER_SEC)
		return;

	ts->tv_nsec -= NS_PER_SEC;
	++ts->tv_sec;
}
/*
 * Convert a timeval to ticks: 100 ticks per second plus one tick per
 * 10000 microseconds (partial ticks are truncated).
 */
static inline unsigned long timeval_to_jiffy(const struct timeval *ats)
{
	const struct timeval *tv = ats;

	return tv->tv_usec / 10000 + tv->tv_sec * 100;
}
/*
 * Convert a timespec to ticks: 100 ticks per second plus one tick per
 * 10000000 nanoseconds (partial ticks are truncated).
 */
static inline unsigned long timespec_to_jiffy(const struct timespec *ats)
{
	time_t from_sec = ats->tv_sec * 100;
	long from_nsec = ats->tv_nsec / 10000000;

	return from_sec + from_nsec;
}
void reset_cputime();
void set_cputime(int mode);
int do_munmap(void *addr, size_t len);
intptr_t do_mmap(intptr_t addr0, size_t len0, int prot, int flags, int fd,
off_t off0);
void clear_host_pte(uintptr_t addr, size_t len);
@@ -395,6 +449,8 @@ int do_shmget(key_t key, size_t size, int shmflg);
struct process_vm;
int arch_map_vdso(struct process_vm *vm); /* arch dependent */
int arch_setup_vdso(void);
int arch_cpu_read_write_register(struct ihk_os_cpu_register *desc,
enum mcctrl_os_cpu_operation op);
#define VDSO_MAXPAGES 2
struct vdso {
@@ -431,4 +487,64 @@ struct get_cpu_mapping_req {
#endif
};
enum perf_ctrl_type {
PERF_CTRL_SET,
PERF_CTRL_GET,
PERF_CTRL_ENABLE,
PERF_CTRL_DISABLE,
};
/*
 * Descriptor of one performance-counter control request.
 * ctrl_type selects the operation (enum perf_ctrl_type); the anonymous
 * union carries the operation-specific arguments.
 */
struct perf_ctrl_desc {
enum perf_ctrl_type ctrl_type;
int status;
union {
/* for PERF_CTRL_SET, PERF_CTRL_GET: a single counter and its settings */
struct {
unsigned int target_cntr;
unsigned long config;
unsigned long read_value;
unsigned disabled :1,
pinned :1,
exclude_user :1,
exclude_kernel :1,
exclude_hv :1,
exclude_idle :1;
};
/*
* Originally labelled "for START, STOP".
* NOTE(review): enum perf_ctrl_type has no START/STOP members;
* presumably this variant serves PERF_CTRL_ENABLE and
* PERF_CTRL_DISABLE -- confirm.
*/
struct {
unsigned long target_cntr_mask;
};
};
};
#define UTI_FLAG_NUMA_SET (1ULL<<1) /* Indicates NUMA_SET is specified */
#define UTI_FLAG_SAME_NUMA_DOMAIN (1ULL<<2)
#define UTI_FLAG_DIFFERENT_NUMA_DOMAIN (1ULL<<3)
#define UTI_FLAG_SAME_L1 (1ULL<<4)
#define UTI_FLAG_SAME_L2 (1ULL<<5)
#define UTI_FLAG_SAME_L3 (1ULL<<6)
#define UTI_FLAG_DIFFERENT_L1 (1ULL<<7)
#define UTI_FLAG_DIFFERENT_L2 (1ULL<<8)
#define UTI_FLAG_DIFFERENT_L3 (1ULL<<9)
#define UTI_FLAG_EXCLUSIVE_CPU (1ULL<<10)
#define UTI_FLAG_CPU_INTENSIVE (1ULL<<11)
#define UTI_FLAG_HIGH_PRIORITY (1ULL<<12)
#define UTI_FLAG_NON_COOPERATIVE (1ULL<<13)
/* Linux default value is used */
#define UTI_MAX_NUMA_DOMAINS (1024)
/*
 * Attributes describing where a utility thread should run and how it
 * behaves.
 */
typedef struct uti_attr {
/* The UTI_CPU_SET environment variable is used to denote the preferred
location of the utility thread.
numa_set holds UTI_MAX_NUMA_DOMAINS bits, rounded up to whole 64-bit
words -- presumably one bit per NUMA domain; confirm. */
uint64_t numa_set[(UTI_MAX_NUMA_DOMAINS + sizeof(uint64_t) * 8 - 1) /
(sizeof(uint64_t) * 8)];
uint64_t flags; /* Location and behavior hints as a bitmap (presumably the UTI_FLAG_* bits) */
} uti_attr_t;
#endif

View File

@@ -27,6 +27,8 @@ typedef int (*waitq_func_t)(struct waitq_entry *wait, unsigned mode,
int default_wake_function(struct waitq_entry *wait, unsigned mode, int flags,
void *key);
int locked_wake_function(struct waitq_entry *wait, unsigned mode, int flags,
void *key);
typedef struct waitq {
ihk_spinlock_t lock;
@@ -57,6 +59,13 @@ typedef struct waitq_entry {
.link = { &(name).link, &(name).link } \
}
/*
 * Define a waitq_entry_t variable called "name" whose private pointer is
 * tsk, whose wake callback is locked_wake_function() and whose link
 * members both point back at the entry's own link (not on any queue).
 */
#define DECLARE_WAITQ_ENTRY_LOCKED(name, tsk) \
waitq_entry_t name = { \
.private = tsk, \
.func = locked_wake_function, \
.link = { &(name).link, &(name).link } \
}
extern void waitq_init(waitq_t *waitq);
extern void waitq_init_entry(waitq_entry_t *entry, struct thread *proc);
extern int waitq_active(waitq_t *waitq);

26
kernel/include/xpmem.h Normal file
View File

@@ -0,0 +1,26 @@
/**
* \file xpmem.h
* License details are found in the file LICENSE.
* \brief
* Structures and functions of xpmem
*/
/*
* HISTORY
*/
#ifndef _XPMEM_H
#define _XPMEM_H
#include <process.h>
#include <ihk/context.h>
#define XPMEM_DEV_PATH "/dev/xpmem"
extern int xpmem_open(ihk_mc_user_context_t *ctx);
extern int xpmem_remove_process_memory_range(struct process_vm *vm,
struct vm_range *vmr);
extern int xpmem_fault_process_memory_range(struct process_vm *vm,
struct vm_range *vmr, unsigned long vaddr, uint64_t reason);
#endif /* _XPMEM_H */

View File

@@ -0,0 +1,490 @@
/**
* \file xpmem_private.h
* License details are found in the file LICENSE.
* \brief
* Private Cross Partition Memory (XPMEM) structures and macros.
*/
/*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*
* Copyright (c) 2004-2007 Silicon Graphics, Inc. All Rights Reserved.
* Copyright 2009, 2010, 2014 Cray Inc. All Rights Reserved
* Copyright (c) 2014-2016 Los Alamos National Security, LCC. All rights
* reserved.
*/
/*
* HISTORY
*/
#ifndef _XPMEM_PRIVATE_H
#define _XPMEM_PRIVATE_H
#include <mc_xpmem.h>
#include <xpmem.h>
#define XPMEM_CURRENT_VERSION 0x00026003
//#define DEBUG_PRINT_XPMEM
#ifdef DEBUG_PRINT_XPMEM
#define dkprintf(...) kprintf(__VA_ARGS__)
#define ekprintf(...) kprintf(__VA_ARGS__)
#define XPMEM_DEBUG(format, a...) kprintf("[%d] %s: "format"\n", cpu_local_var(current)->proc->rgid, __func__, ##a)
#else
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) kprintf(__VA_ARGS__)
#define XPMEM_DEBUG(format, a...) do { if (0) kprintf("\n"); } while (0)
#endif
//#define USE_DBUG_ON
#ifdef USE_DBUG_ON
#define DBUG_ON(condition) do { if (condition) kprintf("[%d] BUG: func=%s\n", cpu_local_var(current)->proc->rgid, __func__); } while (0)
#else
#define DBUG_ON(condition)
#endif
#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)
#define min(x, y) ({ \
__typeof__(x) _min1 = (x); \
__typeof__(y) _min2 = (y); \
(void) (&_min1 == &_min2); \
_min1 < _min2 ? _min1 : _min2;})
#define max(x, y) ({ \
__typeof__(x) _max1 = (x); \
__typeof__(y) _max2 = (y); \
(void) (&_max1 == &_max2); \
_max1 > _max2 ? _max1 : _max2;})
/* Largest error number that the pointer encoding below can represent. */
#define MAX_ERRNO 4095
/* True when x falls within the top MAX_ERRNO values of unsigned long. */
#define IS_ERR_VALUE(x) ((x) >= (unsigned long)-MAX_ERRNO)

/* Encode an error code as a pointer value. */
static inline void *ERR_PTR(long error)
{
	void *encoded = (void *)error;

	return encoded;
}

/* Recover the error code stored in a pointer by ERR_PTR(). */
static inline long PTR_ERR(const void *ptr)
{
	long code = (long)ptr;

	return code;
}

/* Returns 1 when ptr carries an encoded error, 0 otherwise. */
static inline long IS_ERR(const void *ptr)
{
	unsigned long raw = (unsigned long)ptr;

	return IS_ERR_VALUE(raw) ? 1 : 0;
}

/* Returns 1 when ptr is NULL or carries an encoded error, 0 otherwise. */
static inline long IS_ERR_OR_NULL(const void *ptr)
{
	if (!ptr)
		return 1;

	return IS_ERR(ptr);
}
/*
* Both the xpmem_segid_t and xpmem_apid_t are of type __s64 and designed
* to be opaque to the user. Both consist of the same underlying fields.
*
* The 'uniq' field is designed to give each segid or apid a unique value.
* Each type is only unique with respect to itself.
*
* An ID is never less than or equal to zero.
*/
struct xpmem_id {
pid_t tgid; /* thread group that owns ID */
unsigned int uniq; /* this value makes the ID unique */
};
typedef union {
struct xpmem_id xpmem_id;
xpmem_segid_t segid;
xpmem_apid_t apid;
} xpmem_id_t;
/* Shift INT_MAX by one so we can tell when we overflow. */
#define XPMEM_MAX_UNIQ_ID (INT_MAX >> 1)
/*
 * Extract the tgid field from a segid by viewing it through xpmem_id_t.
 */
static inline pid_t xpmem_segid_to_tgid(xpmem_segid_t segid)
{
	const xpmem_id_t *id = (const xpmem_id_t *)&segid;

	DBUG_ON(segid <= 0);
	return id->xpmem_id.tgid;
}
/*
 * Extract the tgid field from an apid by viewing it through xpmem_id_t.
 */
static inline pid_t xpmem_apid_to_tgid(xpmem_apid_t apid)
{
	const xpmem_id_t *id = (const xpmem_id_t *)&apid;

	DBUG_ON(apid <= 0);
	return id->xpmem_id.tgid;
}
/*
* Hash Tables
*
* XPMEM utilizes hash tables to enable faster lookups of list entries.
* These hash tables are implemented as arrays. A simple modulus of the hash
* key yields the appropriate array index. A hash table's array element (i.e.,
* hash table bucket) consists of a hash list and the lock that protects it.
*
* XPMEM has the following two hash tables:
*
* table bucket key
* part->tg_hashtable list of struct xpmem_thread_group tgid
* tg->ap_hashtable list of struct xpmem_access_permit apid.uniq
*/
struct xpmem_hashlist {
mcs_rwlock_lock_t lock; /* lock for hash list */
struct list_head list; /* hash list */
};
#define XPMEM_TG_HASHTABLE_SIZE 8
#define XPMEM_AP_HASHTABLE_SIZE 8
/*
 * Map a tgid to its bucket in the thread-group hash table: the tgid,
 * taken as unsigned, modulo XPMEM_TG_HASHTABLE_SIZE.
 */
static inline int xpmem_tg_hashtable_index(pid_t tgid)
{
	int index;

	index = (unsigned int)tgid % XPMEM_TG_HASHTABLE_SIZE;
	/* tgid is a pid_t: print it with %d, as the other tgid traces in
	 * this header do, instead of %lu */
	XPMEM_DEBUG("return: tgid=%d, index=%d", tgid, index);
	return index;
}
/*
 * Map an apid to its bucket in the access-permit hash table: the uniq
 * field of the apid modulo XPMEM_AP_HASHTABLE_SIZE.
 */
static inline int xpmem_ap_hashtable_index(xpmem_apid_t apid)
{
	const xpmem_id_t *id = (const xpmem_id_t *)&apid;
	int bucket;

	DBUG_ON(apid <= 0);
	bucket = id->xpmem_id.uniq % XPMEM_AP_HASHTABLE_SIZE;
	XPMEM_DEBUG("return: apid=0x%lx, index=%d", apid, bucket);
	return bucket;
}
/*
* general internal driver structures
*/
struct xpmem_thread_group {
ihk_spinlock_t lock; /* tg lock */
pid_t tgid; /* tg's tgid */
uid_t uid; /* tg's uid */
gid_t gid; /* tg's gid */
volatile int flags; /* tg attributes and state */
ihk_atomic_t uniq_segid; /* segid uniq */
ihk_atomic_t uniq_apid; /* apid uniq */
mcs_rwlock_lock_t seg_list_lock; /* tg's list of segs lock */
struct list_head seg_list; /* tg's list of segs */
ihk_atomic_t refcnt; /* references to tg */
ihk_atomic_t n_pinned; /* #of pages pinned by this tg */
struct list_head tg_hashlist; /* tg hash list */
struct thread *group_leader; /* thread group leader */
struct process_vm *vm; /* tg's process_vm */
struct xpmem_hashlist ap_hashtable[]; /* locks + ap hash lists */
};
struct xpmem_segment {
ihk_spinlock_t lock; /* seg lock */
xpmem_segid_t segid; /* unique segid */
unsigned long vaddr; /* starting address */
size_t size; /* size of seg */
int permit_type; /* permission scheme */
void *permit_value; /* permission data */
volatile int flags; /* seg attributes and state */
ihk_atomic_t refcnt; /* references to seg */
struct xpmem_thread_group *tg; /* creator tg */
struct list_head ap_list; /* local access permits of seg */
struct list_head seg_list; /* tg's list of segs */
};
struct xpmem_access_permit {
ihk_spinlock_t lock; /* access permit lock */
xpmem_apid_t apid; /* unique apid */
int mode; /* read/write mode */
volatile int flags; /* access permit attributes and state */
ihk_atomic_t refcnt; /* references to access permit */
struct xpmem_segment *seg; /* seg permitted to be accessed */
struct xpmem_thread_group *tg; /* access permit's tg */
struct list_head att_list; /* atts of this access permit's seg */
struct list_head ap_list; /* access permits linked to seg */
struct list_head ap_hashlist; /* access permit hash list */
};
/* One attachment of a segment into a process address space. */
struct xpmem_attachment {
mcs_rwlock_lock_t at_lock; /* att lock */
unsigned long vaddr; /* starting address of the attached seg */
unsigned long at_vaddr; /* address where seg is attached */
size_t at_size; /* size of seg attachment */
struct vm_range *at_vmr; /* vm_range where seg is attached */
volatile int flags; /* att attributes and state */
ihk_atomic_t refcnt; /* references to att */
struct xpmem_access_permit *ap; /* associated access permit */
struct list_head att_list; /* atts linked to access permit */
struct process_vm *vm; /* process_vm attached to */
};
struct xpmem_partition {
ihk_atomic_t n_opened; /* # of /dev/xpmem opened */
struct xpmem_hashlist tg_hashtable[]; /* locks + tg hash lists */
};
#define XPMEM_FLAG_DESTROYING 0x00040 /* being destroyed */
#define XPMEM_FLAG_DESTROYED 0x00080 /* 'being destroyed' finished */
#define XPMEM_FLAG_VALIDPTEs 0x00200 /* valid PTEs exist */
struct xpmem_perm {
uid_t uid;
gid_t gid;
unsigned long mode;
};
#define XPMEM_PERM_IRUSR 00400
#define XPMEM_PERM_IWUSR 00200
extern struct xpmem_partition *xpmem_my_part;
static int xpmem_ioctl(struct mckfd *mckfd, ihk_mc_user_context_t *ctx);
static int xpmem_close(struct mckfd *mckfd, ihk_mc_user_context_t *ctx);
static int xpmem_init(void);
static void xpmem_exit(void);
static int __xpmem_open(void);
static void xpmem_destroy_tg(struct xpmem_thread_group *);
static int xpmem_make(unsigned long, size_t, int, void *, xpmem_segid_t *);
static xpmem_segid_t xpmem_make_segid(struct xpmem_thread_group *);
static int xpmem_remove(xpmem_segid_t);
static void xpmem_remove_seg(struct xpmem_thread_group *,
struct xpmem_segment *);
static void xpmem_remove_segs_of_tg(struct xpmem_thread_group *seg_tg);
static int xpmem_get(xpmem_segid_t, int, int, void *, xpmem_apid_t *);
static int xpmem_check_permit_mode(int, struct xpmem_segment *);
static int xpmem_perms(struct xpmem_perm *, short);
static xpmem_apid_t xpmem_make_apid(struct xpmem_thread_group *);
static int xpmem_release(xpmem_apid_t);
static void xpmem_release_ap(struct xpmem_thread_group *,
struct xpmem_access_permit *);
static void xpmem_release_aps_of_tg(struct xpmem_thread_group *ap_tg);
static int xpmem_attach(struct mckfd *, xpmem_apid_t, off_t, size_t,
unsigned long, int, int, unsigned long *);
static int xpmem_detach(unsigned long);
static int xpmem_vm_munmap(struct process_vm *vm, void *addr, size_t len);
static int xpmem_remove_process_range(struct process_vm *vm,
unsigned long start, unsigned long end, int *ro_freedp);
static int xpmem_free_process_memory_range(struct process_vm *vm,
struct vm_range *range);
static void xpmem_detach_att(struct xpmem_access_permit *,
struct xpmem_attachment *);
static void xpmem_clear_PTEs(struct xpmem_segment *);
static void xpmem_clear_PTEs_range(struct xpmem_segment *, unsigned long,
unsigned long);
static void xpmem_clear_PTEs_of_ap(struct xpmem_access_permit *, unsigned long,
unsigned long);
static void xpmem_clear_PTEs_of_att(struct xpmem_attachment *, unsigned long,
unsigned long);
static int xpmem_remap_pte(struct process_vm *, struct vm_range *,
unsigned long, uint64_t, struct xpmem_segment *, unsigned long);
static int xpmem_ensure_valid_page(struct xpmem_segment *, unsigned long);
static pte_t * xpmem_vaddr_to_pte(struct process_vm *, unsigned long,
size_t *pgsize);
static int xpmem_pin_page(struct xpmem_thread_group *, struct thread *,
struct process_vm *, unsigned long);
static void xpmem_unpin_pages(struct xpmem_segment *, struct process_vm *,
unsigned long, size_t);
static struct xpmem_thread_group * __xpmem_tg_ref_by_tgid_nolock_internal(
pid_t, int, int);
/*
 * Look up the thread group for tgid while holding the reader lock of its
 * hash bucket in xpmem_my_part->tg_hashtable.  The lookup itself is done
 * by __xpmem_tg_ref_by_tgid_nolock_internal(), to which return_destroying
 * is passed through unchanged.
 */
static inline struct xpmem_thread_group *__xpmem_tg_ref_by_tgid(
	pid_t tgid,
	int return_destroying)
{
	struct mcs_rwlock_node_irqsave node;
	mcs_rwlock_lock_t *bucket_lock;
	struct xpmem_thread_group *found;
	int bucket;

	XPMEM_DEBUG("call: tgid=%d, return_destroying=%d",
		tgid, return_destroying);

	bucket = xpmem_tg_hashtable_index(tgid);
	bucket_lock = &xpmem_my_part->tg_hashtable[bucket].lock;

	mcs_rwlock_reader_lock(bucket_lock, &node);
	found = __xpmem_tg_ref_by_tgid_nolock_internal(tgid, bucket,
		return_destroying);
	mcs_rwlock_reader_unlock(bucket_lock, &node);

	XPMEM_DEBUG("return: tg=0x%p", found);
	return found;
}
/*
 * Same lookup as __xpmem_tg_ref_by_tgid() but without taking the hash
 * bucket lock in this function.
 */
static inline struct xpmem_thread_group *__xpmem_tg_ref_by_tgid_nolock(
	pid_t tgid,
	int return_destroying)
{
	struct xpmem_thread_group *found;
	int bucket;

	XPMEM_DEBUG("call: tgid=%d, return_destroying=%d",
		tgid, return_destroying);

	bucket = xpmem_tg_hashtable_index(tgid);
	found = __xpmem_tg_ref_by_tgid_nolock_internal(tgid, bucket,
		return_destroying);

	XPMEM_DEBUG("return: tg=0x%p", found);
	return found;
}
#define xpmem_tg_ref_by_tgid(t) __xpmem_tg_ref_by_tgid(t, 0)
#define xpmem_tg_ref_by_tgid_all(t) __xpmem_tg_ref_by_tgid(t, 1)
#define xpmem_tg_ref_by_tgid_nolock(t) __xpmem_tg_ref_by_tgid_nolock(t, 0)
#define xpmem_tg_ref_by_tgid_all_nolock(t) __xpmem_tg_ref_by_tgid_nolock(t, 1)
static struct xpmem_thread_group * xpmem_tg_ref_by_segid(xpmem_segid_t);
static struct xpmem_thread_group * xpmem_tg_ref_by_apid(xpmem_apid_t);
static void xpmem_tg_deref(struct xpmem_thread_group *);
static struct xpmem_segment *xpmem_seg_ref_by_segid(struct xpmem_thread_group *,
xpmem_segid_t);
static void xpmem_seg_deref(struct xpmem_segment *);
static struct xpmem_access_permit * xpmem_ap_ref_by_apid(
struct xpmem_thread_group *, xpmem_apid_t);
static void xpmem_ap_deref(struct xpmem_access_permit *);
static void xpmem_att_deref(struct xpmem_attachment *);
static int xpmem_validate_access(struct xpmem_access_permit *, off_t, size_t,
int, unsigned long *);
/*
* Inlines that mark an internal driver structure as being destroyable or not.
* The idea is to set the refcnt to 1 at structure creation time and then
* drop that reference at the time the structure is to be destroyed.
*/
/*
 * Set tg's reference count to 1; that reference is the one later dropped by
 * xpmem_tg_destroyable().
 */
static inline void xpmem_tg_not_destroyable(
	struct xpmem_thread_group *tg)
{
	ihk_atomic_set(&tg->refcnt, 1);
	/* refcnt is an ihk_atomic_t: read it through the accessor instead of
	 * handing the object itself to a %d conversion */
	XPMEM_DEBUG("return: tg->refcnt=%d", ihk_atomic_read(&tg->refcnt));
}
/*
 * Drop one reference on tg via xpmem_tg_deref(); by the convention of this
 * header that is the reference set at creation time.
 */
static inline void xpmem_tg_destroyable(
struct xpmem_thread_group *tg)
{
XPMEM_DEBUG("call: ");
xpmem_tg_deref(tg);
XPMEM_DEBUG("return: ");
}
/*
 * Set seg's reference count to 1; that reference is the one later dropped
 * by xpmem_seg_destroyable().
 */
static inline void xpmem_seg_not_destroyable(
	struct xpmem_segment *seg)
{
	ihk_atomic_set(&seg->refcnt, 1);
	/* refcnt is an ihk_atomic_t: read it through the accessor instead of
	 * handing the object itself to a %d conversion */
	XPMEM_DEBUG("return: seg->refcnt=%d", ihk_atomic_read(&seg->refcnt));
}
/*
 * Drop one reference on seg via xpmem_seg_deref(); by the convention of
 * this header that is the reference set at creation time.
 */
static inline void xpmem_seg_destroyable(
struct xpmem_segment *seg)
{
XPMEM_DEBUG("call: ");
xpmem_seg_deref(seg);
XPMEM_DEBUG("return: ");
}
/*
 * Set ap's reference count to 1; that reference is the one later dropped by
 * xpmem_ap_destroyable().
 */
static inline void xpmem_ap_not_destroyable(
	struct xpmem_access_permit *ap)
{
	ihk_atomic_set(&ap->refcnt, 1);
	/* refcnt is an ihk_atomic_t: read it through the accessor instead of
	 * handing the object itself to a %d conversion */
	XPMEM_DEBUG("return: ap->refcnt=%d", ihk_atomic_read(&ap->refcnt));
}
/*
 * Drop one reference on ap via xpmem_ap_deref(); by the convention of this
 * header that is the reference set at creation time.
 */
static inline void xpmem_ap_destroyable(
struct xpmem_access_permit *ap)
{
XPMEM_DEBUG("call: ");
xpmem_ap_deref(ap);
XPMEM_DEBUG("return: ");
}
/*
 * Set att's reference count to 1; that reference is the one later dropped
 * by xpmem_att_destroyable().
 */
static inline void xpmem_att_not_destroyable(
	struct xpmem_attachment *att)
{
	ihk_atomic_set(&att->refcnt, 1);
	/* refcnt is an ihk_atomic_t: read it through the accessor instead of
	 * handing the object itself to a %d conversion */
	XPMEM_DEBUG("return: att->refcnt=%d", ihk_atomic_read(&att->refcnt));
}
/*
 * Drop one reference on att via xpmem_att_deref(); by the convention of
 * this header that is the reference set at creation time.
 */
static inline void xpmem_att_destroyable(
struct xpmem_attachment *att)
{
XPMEM_DEBUG("call: ");
xpmem_att_deref(att);
XPMEM_DEBUG("return: ");
}
/*
* Inlines that increment the refcnt for the specified structure.
*/
/*
 * Take one more reference on tg.  With USE_DBUG_ON defined, DBUG_ON reports
 * when the count was not positive before the increment.
 */
static inline void xpmem_tg_ref(
	struct xpmem_thread_group *tg)
{
	DBUG_ON(ihk_atomic_read(&tg->refcnt) <= 0);
	ihk_atomic_inc(&tg->refcnt);
	/* refcnt is an ihk_atomic_t: read it through the accessor instead of
	 * handing the object itself to a %d conversion */
	XPMEM_DEBUG("return: tg->refcnt=%d", ihk_atomic_read(&tg->refcnt));
}
/*
 * Take one more reference on seg.  With USE_DBUG_ON defined, DBUG_ON
 * reports when the count was not positive before the increment.
 */
static inline void xpmem_seg_ref(
	struct xpmem_segment *seg)
{
	DBUG_ON(ihk_atomic_read(&seg->refcnt) <= 0);
	ihk_atomic_inc(&seg->refcnt);
	/* refcnt is an ihk_atomic_t: read it through the accessor instead of
	 * handing the object itself to a %d conversion */
	XPMEM_DEBUG("return: seg->refcnt=%d", ihk_atomic_read(&seg->refcnt));
}
/*
 * Take one more reference on ap.  With USE_DBUG_ON defined, DBUG_ON reports
 * when the count was not positive before the increment.
 */
static inline void xpmem_ap_ref(
	struct xpmem_access_permit *ap)
{
	DBUG_ON(ihk_atomic_read(&ap->refcnt) <= 0);
	ihk_atomic_inc(&ap->refcnt);
	/* refcnt is an ihk_atomic_t: read it through the accessor instead of
	 * handing the object itself to a %d conversion */
	XPMEM_DEBUG("return: ap->refcnt=%d", ihk_atomic_read(&ap->refcnt));
}
/*
 * Take one more reference on att.  With USE_DBUG_ON defined, DBUG_ON
 * reports when the count was not positive before the increment.
 */
static inline void xpmem_att_ref(
	struct xpmem_attachment *att)
{
	DBUG_ON(ihk_atomic_read(&att->refcnt) <= 0);
	ihk_atomic_inc(&att->refcnt);
	/* refcnt is an ihk_atomic_t: read it through the accessor instead of
	 * handing the object itself to a %d conversion */
	XPMEM_DEBUG("return: att->refcnt=%d", ihk_atomic_read(&att->refcnt));
}
/*
 * Returns 1 when vmr->private_data is set (non-NULL), 0 otherwise.
 */
static inline int xpmem_is_private_data(
	struct vm_range *vmr)
{
	return vmr->private_data ? 1 : 0;
}
#endif /* _XPMEM_PRIVATE_H */

View File

@@ -31,6 +31,7 @@
#include <cls.h>
#include <syscall.h>
#include <sysfs.h>
#include <rusage.h>
//#define IOCTL_FUNC_EXTENSION
#ifdef IOCTL_FUNC_EXTENSION
@@ -40,17 +41,21 @@
//#define DEBUG_PRINT_INIT
#ifdef DEBUG_PRINT_INIT
#define dkprintf kprintf
#define dkprintf(...) do { kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) do { kprintf(__VA_ARGS__); } while (0)
#else
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define dkprintf(...) do { } while (0)
#define ekprintf(...) do { kprintf(__VA_ARGS__); } while (0)
#endif
int osnum = 0;
extern struct ihk_kmsg_buf kmsg_buf;
extern unsigned long ihk_mc_get_ns_per_tsc(void);
extern long syscall(int, ihk_mc_user_context_t *);
struct ihk_os_monitor *monitor;
static void handler_init(void)
{
ihk_mc_set_syscall_handler(syscall);
@@ -108,11 +113,11 @@ static void dma_test(void)
}
#endif
extern char *ihk_mc_get_kernel_args(void);
extern char *ihk_get_kargs(void);
char *find_command_line(char *name)
{
char *cmdline = ihk_mc_get_kernel_args();
char *cmdline = ihk_get_kargs();
if (!cmdline) {
return NULL;
@@ -122,7 +127,7 @@ char *find_command_line(char *name)
static void parse_kargs(void)
{
kprintf("KCommand Line: %s\n", ihk_mc_get_kernel_args());
kprintf("KCommand Line: %s\n", ihk_get_kargs());
if (1) {
char *key = "osnum=";
@@ -239,6 +244,34 @@ static void time_init(void)
return;
}
/*
 * Allocate the global monitor area, zero it, fill in the processor count,
 * NUMA node count and ns-per-TSC value, and register its physical address
 * and size through ihk_set_monitor().
 */
static void monitor_init()
{
int z;
unsigned long phys;
/* bytes needed: one ihk_os_monitor plus one ihk_os_cpu_monitor per processor */
z = sizeof(struct ihk_os_monitor) +
sizeof(struct ihk_os_cpu_monitor) * num_processors;
/* round up to whole pages; from here on z is a page count */
z = (z + PAGE_SIZE -1) >> PAGE_SHIFT;
/* NOTE(review): the allocation result is used without a NULL check --
 * confirm that an IHK_MC_AP_CRITICAL allocation cannot return NULL here */
monitor = ihk_mc_alloc_pages(z, IHK_MC_AP_CRITICAL);
memset(monitor, 0, z * PAGE_SIZE);
monitor->num_processors = num_processors;
monitor->num_numa_nodes = ihk_mc_get_nr_numa_nodes();
monitor->ns_per_tsc = ihk_mc_get_ns_per_tsc();
phys = virt_to_phys(monitor);
/* the registered size is the exact byte count, not the page-rounded one */
ihk_set_monitor(phys, sizeof(struct ihk_os_monitor) +
sizeof(struct ihk_os_cpu_monitor) * num_processors);
}
int nmi_mode;
static void nmi_init()
{
unsigned long phys;
phys = virt_to_phys(&nmi_mode);
ihk_set_nmi_mode_addr(phys);
}
static void rest_init(void)
{
handler_init();
@@ -250,11 +283,13 @@ static void rest_init(void)
//pc_test();
ap_init();
monitor_init();
cpu_local_var_init();
nmi_init();
time_init();
kmalloc_init();
ikc_master_init();
ihk_ikc_master_init();
proc_init();
@@ -320,7 +355,8 @@ static void setup_remote_snooping_samples(void)
static void populate_sysfs(void)
{
cpu_sysfs_setup();
setup_remote_snooping_samples();
numa_sysfs_setup();
//setup_remote_snooping_samples();
} /* populate_sysfs() */
int host_ikc_inited = 0;
@@ -336,11 +372,12 @@ static void post_init(void)
}
if (find_command_line("hidos")) {
extern ihk_spinlock_t syscall_lock;
init_host_syscall_channel();
init_host_syscall_channel2();
ihk_mc_spinlock_init(&syscall_lock);
int ikc_cpu = ihk_mc_get_ikc_cpu(ihk_mc_get_processor_id());
if(ikc_cpu < 0) {
ekprintf("%s,ihk_mc_get_ikc_cpu failed\n", __FUNCTION__);
}
init_host_ikc2mckernel();
init_host_ikc2linux(ikc_cpu);
}
arch_setup_vdso();
@@ -372,7 +409,7 @@ int main(void)
kmsg_init(mode);
kputs("IHK/McKernel started.\n");
ihk_set_kmsg(virt_to_phys(&kmsg_buf), IHK_KMSG_SIZE);
arch_init();
/*

File diff suppressed because it is too large Load Diff

View File

@@ -21,7 +21,7 @@ static struct ihk_ikc_channel_desc *mchannel;
static int arch_master_channel_packet_handler(struct ihk_ikc_channel_desc *,
void *__packet, void *arg);
void ikc_master_init(void)
void ihk_ikc_master_init(void)
{
mchannel = kmalloc(sizeof(struct ihk_ikc_channel_desc) +
sizeof(struct ihk_ikc_master_packet),

File diff suppressed because it is too large Load Diff

View File

@@ -17,12 +17,14 @@
#include <ihk/debug.h>
#include <ihk/ikc.h>
#include <ikc/master.h>
#include <syscall.h>
#include <cls.h>
#include <syscall.h>
#include <kmalloc.h>
#include <process.h>
#include <page.h>
#include <mman.h>
#include <bitmap.h>
#include <init.h>
//#define DEBUG_PRINT_PROCFS
@@ -35,6 +37,7 @@
extern int snprintf(char * buf, size_t size, const char *fmt, ...);
extern int sprintf(char * buf, const char *fmt, ...);
extern int sscanf(const char * buf, const char * fmt, ...);
extern int scnprintf(char * buf, size_t size, const char *fmt, ...);
extern int osnum;
@@ -44,7 +47,7 @@ procfs_thread_ctl(struct thread *thread, int msg)
struct ihk_ikc_channel_desc *syscall_channel;
struct ikc_scd_packet packet;
syscall_channel = cpu_local_var(syscall_channel);
syscall_channel = cpu_local_var(ikc2linux);
memset(&packet, '\0', sizeof packet);
packet.arg = thread->tid;
packet.msg = msg;
@@ -73,11 +76,11 @@ procfs_delete_thread(struct thread *thread)
*
* \param rarg returned argument
*/
void
process_procfs_request(unsigned long rarg)
void process_procfs_request(struct ikc_scd_packet *rpacket)
{
unsigned long rarg = rpacket->arg;
unsigned long parg, pbuf;
struct thread *thread = NULL;
struct thread *thread = NULL;
struct process *proc = NULL;
struct process_vm *vm = NULL;
struct procfs_read *r;
@@ -93,7 +96,7 @@ process_procfs_request(unsigned long rarg)
dprintf("process_procfs_request: invoked.\n");
syscall_channel = get_cpu_local_var(0)->syscall_channel;
syscall_channel = get_cpu_local_var(0)->ikc2linux;
dprintf("rarg: %x\n", rarg);
parg = ihk_mc_map_memory(NULL, rarg, sizeof(struct procfs_read));
@@ -158,7 +161,7 @@ process_procfs_request(unsigned long rarg)
*/
ret = sscanf(p, "%d/", &pid);
if (ret == 1) {
struct mcs_rwlock_node tlock;
struct mcs_rwlock_node_irqsave tlock;
int tids;
struct thread *thread1 = NULL;
@@ -175,7 +178,7 @@ process_procfs_request(unsigned long rarg)
else
tid = pid;
mcs_rwlock_reader_lock_noirq(&proc->threads_lock, &tlock);
mcs_rwlock_reader_lock(&proc->threads_lock, &tlock);
list_for_each_entry(thread, &proc->threads_list, siblings_list){
if(thread->tid == tid)
break;
@@ -185,15 +188,15 @@ process_procfs_request(unsigned long rarg)
if(thread == NULL){
kprintf("process_procfs_request: no such tid %d-%d\n", pid, tid);
if(tids){
mcs_rwlock_reader_unlock(&proc->threads_lock, &tlock);
process_unlock(proc, &lock);
mcs_rwlock_reader_unlock_noirq(&proc->threads_lock, &tlock);
goto end;
}
thread = thread1;
}
if(thread)
hold_thread(thread);
mcs_rwlock_reader_unlock_noirq(&proc->threads_lock, &tlock);
mcs_rwlock_reader_unlock(&proc->threads_lock, &tlock);
hold_process(proc);
vm = proc->vm;
if(vm)
@@ -404,12 +407,34 @@ process_procfs_request(unsigned long rarg)
/*
* mcos%d/PID/status
*/
#define BITMASKS_BUF_SIZE 2048
if (strcmp(p, "status") == 0) {
extern int num_processors; /* kernel/ap.c */
struct vm_range *range;
unsigned long lockedsize = 0;
char tmp[1024];
char *tmp;
char *bitmasks;
int bitmasks_offset = 0;
char *cpu_bitmask, *cpu_list, *numa_bitmask, *numa_list;
int len;
tmp = kmalloc(8192, IHK_MC_AP_CRITICAL);
if (!tmp) {
kprintf("%s: error allocating /proc/self/status buffer\n",
__FUNCTION__);
ans = 0;
goto end;
}
bitmasks = kmalloc(BITMASKS_BUF_SIZE, IHK_MC_AP_CRITICAL);
if (!tmp) {
kprintf("%s: error allocating /proc/self/status bitmaks buffer\n",
__FUNCTION__);
kfree(tmp);
ans = 0;
goto end;
}
ihk_mc_spinlock_lock_noirq(&proc->vm->memory_range_lock);
list_for_each_entry(range, &proc->vm->vm_range_list, list) {
if(range->flag & VR_LOCKED)
@@ -417,13 +442,42 @@ process_procfs_request(unsigned long rarg)
}
ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock);
cpu_bitmask = &bitmasks[bitmasks_offset];
bitmasks_offset += bitmap_scnprintf(cpu_bitmask,
BITMASKS_BUF_SIZE - bitmasks_offset,
thread->cpu_set.__bits, num_processors);
bitmasks_offset++;
cpu_list = &bitmasks[bitmasks_offset];
bitmasks_offset += bitmap_scnlistprintf(cpu_list,
BITMASKS_BUF_SIZE - bitmasks_offset,
thread->cpu_set.__bits, __CPU_SETSIZE);
bitmasks_offset++;
numa_bitmask = &bitmasks[bitmasks_offset];
bitmasks_offset += bitmap_scnprintf(numa_bitmask,
BITMASKS_BUF_SIZE - bitmasks_offset,
proc->vm->numa_mask, PROCESS_NUMA_MASK_BITS);
bitmasks_offset++;
numa_list = &bitmasks[bitmasks_offset];
bitmasks_offset += bitmap_scnlistprintf(numa_list,
BITMASKS_BUF_SIZE - bitmasks_offset,
proc->vm->numa_mask, PROCESS_NUMA_MASK_BITS);
bitmasks_offset++;
sprintf(tmp,
"Uid:\t%d\t%d\t%d\t%d\n"
"Gid:\t%d\t%d\t%d\t%d\n"
"VmLck:\t%9lu kB\n",
"VmLck:\t%9lu kB\n"
"Cpus_allowed:\t%s\n"
"Cpus_allowed_list:\t%s\n"
"Mems_allowed:\t%s\n"
"Mems_allowed_list:\t%s\n",
proc->ruid, proc->euid, proc->suid, proc->fsuid,
proc->rgid, proc->egid, proc->sgid, proc->fsgid,
(lockedsize + 1023) >> 10);
(lockedsize + 1023) >> 10,
cpu_bitmask, cpu_list, numa_bitmask, numa_list);
len = strlen(tmp);
if (r->offset < len) {
if (r->offset + r->count < len) {
@@ -437,6 +491,8 @@ process_procfs_request(unsigned long rarg)
ans = 0;
eof = 1;
}
kfree(tmp);
kfree(bitmasks);
goto end;
}
@@ -577,6 +633,7 @@ dataunavail:
packet.msg = SCD_MSG_PROCFS_ANSWER;
packet.arg = rarg;
packet.pid = rpacket->pid;
ret = ihk_ikc_send(syscall_channel, &packet, 0);
if (ret < 0) {

589
kernel/profile.c Normal file
View File

@@ -0,0 +1,589 @@
/**
* \file profile.c
* License details are found in the file LICENSE.
*
* \brief
* Profiler code for various process statistics
* \author Balazs Gerofi <bgerofi@riken.jp>
* Copyright (C) 2017 RIKEN AICS
*/
/*
* HISTORY:
*/
#include <types.h>
#include <kmsg.h>
#include <ihk/cpu.h>
#include <cpulocal.h>
#include <ihk/mm.h>
#include <ihk/debug.h>
#include <ihk/ikc.h>
#include <errno.h>
#include <cls.h>
#include <syscall.h>
#include <page.h>
#include <ihk/lock.h>
#include <ctype.h>
#include <waitq.h>
#include <rlimit.h>
#include <affinity.h>
#include <time.h>
#include <ihk/perfctr.h>
#include <mman.h>
#include <kmalloc.h>
#include <memobj.h>
#include <shm.h>
#include <prio.h>
#include <arch/cpu.h>
#include <limits.h>
#include <march.h>
#include <process.h>
extern char *syscall_name[];
#ifdef PROFILE_ENABLE
/*
 * Debug output control for the profiler.
 * Uncomment the define below to make dkprintf() print through kprintf().
 * ekprintf() prints through kprintf() in both configurations.
 */
//#define DEBUG_PRINT_PROFILE
#ifdef DEBUG_PRINT_PROFILE
#define dkprintf(...) kprintf(__VA_ARGS__)
#define ekprintf(...) kprintf(__VA_ARGS__)
#else
/* The call sits under if (0): it is still compiled but never executed. */
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) kprintf(__VA_ARGS__)
#endif
/*
 * Printable names of the non-syscall profile events.
 * The print routines in this file index this table with
 * (event - PROFILE_EVENT_MIN).
 * NOTE(review): the order presumably has to match the event enumeration
 * declared in the profile header -- confirm when adding entries.
 */
char *profile_event_names[] =
{
"remote_tlb_invalidate",
"page_fault",
"page_fault_anon_clr_mem",
"page_fault_file",
"page_fault_dev_file",
"page_fault_file_clr_mem",
"mpol_alloc_missed",
"mmap_anon_contig_phys",
"mmap_anon_no_contig_phys",
"mmap_regular_file",
"mmap_device_file",
""
};
/*
 * Job-level profiling state; it is read and written under job_profile_lock
 * in profile_accumulate_and_print_job_events().
 */
mcs_lock_node_t job_profile_lock = {0, NULL};
/* Job-wide event counters, allocated lazily on first accumulation. */
struct profile_event *job_profile_events = NULL;
/* Number of processes in the job; -1 means no job accumulation in progress. */
int job_nr_processes = -1;
/* Processes that have not reported yet; the report is printed at 0. */
int job_nr_processes_left = -1;
/* Sum of profile_elapsed_ts over the processes accumulated so far. */
unsigned long job_elapsed_ts;
enum profile_event_type profile_syscall2offload(enum profile_event_type sc)
{
return (PROFILE_SYSCALL_MAX + sc);
}
void profile_event_add(enum profile_event_type type, uint64_t tsc)
{
struct profile_event *event = NULL;
if (!cpu_local_var(current)->profile)
return;
if (!cpu_local_var(current)->profile_events) {
if (profile_alloc_events(cpu_local_var(current)) < 0)
return;
}
if (type < PROFILE_EVENT_MAX) {
event = &cpu_local_var(current)->profile_events[type];
}
else {
kprintf("%s: WARNING: unknown event type %d\n",
__FUNCTION__, type);
return;
}
++event->cnt;
event->tsc += tsc;
}
void profile_print_thread_stats(struct thread *thread)
{
int i;
unsigned long flags;
if (!thread->profile_events)
return;
/* Not yet accumulated period? */
if (thread->profile_start_ts) {
thread->profile_elapsed_ts += (rdtsc() - thread->profile_start_ts);
}
flags = kprintf_lock();
__kprintf("TID: %4d elapsed cycles (excluding idle): %luk\n",
thread->tid,
thread->profile_elapsed_ts / 1000);
for (i = 0; i < PROFILE_SYSCALL_MAX; ++i) {
if (!thread->profile_events[i].cnt &&
!thread->profile_events[i + PROFILE_SYSCALL_MAX].cnt)
continue;
__kprintf("TID: %4d (%3d,%20s): %6u %6luk offl: %6u %6luk (%2d.%2d%%)\n",
thread->tid,
i,
syscall_name[i],
thread->profile_events[i].cnt,
(thread->profile_events[i].tsc /
(thread->profile_events[i].cnt ?
thread->profile_events[i].cnt : 1))
/ 1000,
thread->profile_events[i + PROFILE_SYSCALL_MAX].cnt,
(thread->profile_events[i + PROFILE_SYSCALL_MAX].tsc /
(thread->profile_events[i + PROFILE_SYSCALL_MAX].cnt ?
thread->profile_events[i + PROFILE_SYSCALL_MAX].cnt : 1))
/ 1000,
(thread->profile_events[i].tsc ?
thread->profile_events[i].tsc * 100
/ thread->profile_elapsed_ts : 0),
(thread->profile_events[i].tsc ?
(thread->profile_events[i].tsc * 10000
/ thread->profile_elapsed_ts) % 100 : 0)
);
}
for (i = PROFILE_EVENT_MIN; i < PROFILE_EVENT_MAX; ++i) {
if (!thread->profile_events[i].cnt)
continue;
__kprintf("TID: %4d (%24s): %6u %6luk \n",
thread->tid,
profile_event_names[i - PROFILE_EVENT_MIN],
thread->profile_events[i].cnt,
(thread->profile_events[i].tsc /
(thread->profile_events[i].cnt ?
thread->profile_events[i].cnt : 1))
/ 1000,
(thread->profile_events[i].tsc ?
thread->profile_events[i].tsc * 100
/ thread->profile_elapsed_ts : 0),
(thread->profile_events[i].tsc ?
(thread->profile_events[i].tsc * 10000
/ thread->profile_elapsed_ts) % 100 : 0)
);
}
kprintf_unlock(flags);
}
void profile_print_proc_stats(struct process *proc)
{
int i;
unsigned long flags;
if (!proc->profile_events || !proc->profile_elapsed_ts)
return;
flags = kprintf_lock();
__kprintf("PID: %4d elapsed cycles for all threads (excluding idle): %luk\n",
proc->pid,
proc->profile_elapsed_ts / 1000);
for (i = 0; i < PROFILE_SYSCALL_MAX; ++i) {
if (!proc->profile_events[i].cnt &&
!proc->profile_events[i + PROFILE_SYSCALL_MAX].cnt)
continue;
__kprintf("PID: %4d (%3d,%20s): %6u %6luk offl: %6u %6luk (%2d.%2d%%)\n",
proc->pid,
i,
syscall_name[i],
proc->profile_events[i].cnt,
(proc->profile_events[i].tsc /
(proc->profile_events[i].cnt ?
proc->profile_events[i].cnt : 1))
/ 1000,
proc->profile_events[i + PROFILE_SYSCALL_MAX].cnt,
(proc->profile_events[i + PROFILE_SYSCALL_MAX].tsc /
(proc->profile_events[i + PROFILE_SYSCALL_MAX].cnt ?
proc->profile_events[i + PROFILE_SYSCALL_MAX].cnt : 1))
/ 1000,
(proc->profile_events[i].tsc ?
proc->profile_events[i].tsc * 100
/ proc->profile_elapsed_ts : 0),
(proc->profile_events[i].tsc ?
(proc->profile_events[i].tsc * 10000
/ proc->profile_elapsed_ts) % 100 : 0)
);
}
for (i = PROFILE_EVENT_MIN; i < PROFILE_EVENT_MAX; ++i) {
if (!proc->profile_events[i].cnt)
continue;
__kprintf("PID: %4d (%24s): %6u %6luk \n",
proc->pid,
profile_event_names[i - PROFILE_EVENT_MIN],
proc->profile_events[i].cnt,
(proc->profile_events[i].tsc /
(proc->profile_events[i].cnt ?
proc->profile_events[i].cnt : 1))
/ 1000,
(proc->profile_events[i].tsc &&
proc->profile_elapsed_ts ?
proc->profile_events[i].tsc * 100
/ proc->profile_elapsed_ts : 0),
(proc->profile_events[i].tsc &&
proc->profile_elapsed_ts ?
(proc->profile_events[i].tsc * 10000
/ proc->profile_elapsed_ts) % 100 : 0)
);
}
kprintf_unlock(flags);
}
int profile_accumulate_and_print_job_events(struct process *proc)
{
int i;
unsigned long flags;
struct mcs_lock_node mcs_node;
mcs_lock_lock(&job_profile_lock, &mcs_node);
/* First process? */
if (job_nr_processes == -1) {
job_nr_processes = proc->nr_processes;
job_nr_processes_left = proc->nr_processes;
job_elapsed_ts = 0;
}
--job_nr_processes_left;
/* Allocate event counters */
if (!job_profile_events) {
job_profile_events = kmalloc(sizeof(*job_profile_events) *
PROFILE_EVENT_MAX, IHK_MC_AP_NOWAIT);
if (!job_profile_events) {
kprintf("%s: ERROR: allocating job profile counters\n",
__FUNCTION__);
return -ENOMEM;
}
memset(job_profile_events, 0,
sizeof(*job_profile_events) * PROFILE_EVENT_MAX);
}
/* Accumulate process */
for (i = 0; i < PROFILE_EVENT_MAX; ++i) {
if (!proc->profile_events[i].tsc)
continue;
job_profile_events[i].tsc += proc->profile_events[i].tsc;
job_profile_events[i].cnt += proc->profile_events[i].cnt;
proc->profile_events[i].tsc = 0;
proc->profile_events[i].cnt = 0;
}
job_elapsed_ts += proc->profile_elapsed_ts;
/* Last process? */
if (job_nr_processes_left == 0) {
flags = kprintf_lock();
__kprintf("JOB: (%2d) elapsed cycles for all threads (excluding idle): %luk\n",
job_nr_processes,
job_elapsed_ts / 1000);
for (i = 0; i < PROFILE_SYSCALL_MAX; ++i) {
if (!job_profile_events[i].cnt &&
!job_profile_events[i + PROFILE_SYSCALL_MAX].cnt)
continue;
__kprintf("JOB: (%2d) (%3d,%20s): %6u %6luk offl: %6u %6luk (%2d.%2d%%)\n",
job_nr_processes,
i,
syscall_name[i],
job_profile_events[i].cnt,
(job_profile_events[i].tsc /
(job_profile_events[i].cnt ?
job_profile_events[i].cnt : 1))
/ 1000,
job_profile_events[i + PROFILE_SYSCALL_MAX].cnt,
(job_profile_events[i + PROFILE_SYSCALL_MAX].tsc /
(job_profile_events[i + PROFILE_SYSCALL_MAX].cnt ?
job_profile_events[i + PROFILE_SYSCALL_MAX].cnt : 1))
/ 1000,
(job_profile_events[i].tsc ?
job_profile_events[i].tsc * 100
/ job_elapsed_ts : 0),
(job_profile_events[i].tsc ?
(job_profile_events[i].tsc * 10000
/ job_elapsed_ts) % 100 : 0)
);
job_profile_events[i].tsc = 0;
job_profile_events[i].cnt = 0;
job_profile_events[i + PROFILE_SYSCALL_MAX].tsc = 0;
job_profile_events[i + PROFILE_SYSCALL_MAX].cnt = 0;
}
for (i = PROFILE_EVENT_MIN; i < PROFILE_EVENT_MAX; ++i) {
if (!job_profile_events[i].cnt)
continue;
__kprintf("JOB: (%2d) (%24s): %6u %6luk \n",
job_nr_processes,
profile_event_names[i - PROFILE_EVENT_MIN],
job_profile_events[i].cnt,
(job_profile_events[i].tsc /
(job_profile_events[i].cnt ?
job_profile_events[i].cnt : 1))
/ 1000);
job_profile_events[i].tsc = 0;
job_profile_events[i].cnt = 0;
}
kprintf_unlock(flags);
/* Reset job process indicators */
job_nr_processes = -1;
job_nr_processes_left = -1;
job_elapsed_ts = 0;
}
mcs_lock_unlock(&job_profile_lock, &mcs_node);
return 0;
}
void profile_accumulate_events(struct thread *thread,
struct process *proc)
{
int i;
struct mcs_lock_node mcs_node;
if (!thread->profile_events || !proc->profile_events) return;
mcs_lock_lock(&proc->profile_lock, &mcs_node);
for (i = 0; i < PROFILE_EVENT_MAX; ++i) {
proc->profile_events[i].tsc += thread->profile_events[i].tsc;
proc->profile_events[i].cnt += thread->profile_events[i].cnt;
thread->profile_events[i].tsc = 0;
thread->profile_events[i].cnt = 0;
}
proc->profile_elapsed_ts += thread->profile_elapsed_ts;
if (thread->profile_start_ts) {
proc->profile_elapsed_ts +=
(rdtsc() - thread->profile_start_ts);
}
mcs_lock_unlock(&proc->profile_lock, &mcs_node);
}
/*
 * Make sure both `thread` and its process own a zeroed array of
 * PROFILE_EVENT_MAX event counters, allocating whichever is missing.
 * The process array is allocated under proc->profile_lock.
 *
 * Returns 0 on success or -ENOMEM when an allocation fails. A thread array
 * allocated before a failing process allocation stays attached to the thread.
 */
int profile_alloc_events(struct thread *thread)
{
	struct process *proc = thread->proc;
	struct mcs_lock_node mcs_node;

	if (!thread->profile_events) {
		thread->profile_events = kmalloc(sizeof(*thread->profile_events) *
				PROFILE_EVENT_MAX, IHK_MC_AP_NOWAIT);

		if (!thread->profile_events) {
			kprintf("%s: ERROR: allocating thread private profile counters\n",
					__FUNCTION__);
			return -ENOMEM;
		}

		memset(thread->profile_events, 0,
				sizeof(*thread->profile_events) * PROFILE_EVENT_MAX);
	}

	mcs_lock_lock(&proc->profile_lock, &mcs_node);

	if (!proc->profile_events) {
		proc->profile_events = kmalloc(sizeof(*proc->profile_events) *
				PROFILE_EVENT_MAX, IHK_MC_AP_NOWAIT);

		if (!proc->profile_events) {
			kprintf("%s: ERROR: allocating proc private profile counters\n",
					__FUNCTION__);
			mcs_lock_unlock(&proc->profile_lock, &mcs_node);
			return -ENOMEM;
		}

		/* Size the clear from the array being cleared, like the kmalloc */
		memset(proc->profile_events, 0,
				sizeof(*proc->profile_events) * PROFILE_EVENT_MAX);
	}

	mcs_lock_unlock(&proc->profile_lock, &mcs_node);

	return 0;
}
/*
 * Release the event counters of `thread`.
 * The pointer is cleared so that later checks of thread->profile_events see
 * "no counters" instead of a dangling pointer.
 */
void profile_dealloc_thread_events(struct thread *thread)
{
	kfree(thread->profile_events);
	thread->profile_events = NULL;
}
/*
 * Release the event counters of `proc`.
 * The pointer is cleared so that later checks of proc->profile_events see
 * "no counters" instead of a dangling pointer.
 */
void profile_dealloc_proc_events(struct process *proc)
{
	kfree(proc->profile_events);
	proc->profile_events = NULL;
}
/*
 * Reset the process-level profile: the elapsed counter always, the event
 * counters only if they have been allocated.
 */
static void profile_clear_process(struct process *proc)
{
	proc->profile_elapsed_ts = 0;

	if (proc->profile_events) {
		memset(proc->profile_events, 0,
			sizeof(*proc->profile_events) * PROFILE_EVENT_MAX);
	}
}
/*
 * Reset the thread-level profile: start and elapsed timestamps always, the
 * event counters only if they have been allocated.
 */
static void profile_clear_thread(struct thread *thread)
{
	thread->profile_elapsed_ts = 0;
	thread->profile_start_ts = 0;

	if (thread->profile_events) {
		memset(thread->profile_events, 0,
			sizeof(*thread->profile_events) * PROFILE_EVENT_MAX);
	}
}
/*
 * Stop profiling on `thread` if it is active: close the running period (if
 * one is open) against `now_ts` and clear the start timestamp.
 */
static void profile_stop_thread(struct thread *thread, unsigned long now_ts)
{
	if (!thread->profile)
		return;

	thread->profile = 0;
	if (thread->profile_start_ts) {
		thread->profile_elapsed_ts +=
			(now_ts - thread->profile_start_ts);
	}
	thread->profile_start_ts = 0;
}

/*
 * Control profiling for the calling thread, its process, or the whole job.
 *
 * `flag` is a bit mask:
 *   PROF_JOB   - job level; only PROF_PRINT is acted upon: all threads are
 *                folded into the process, then the process into the job.
 *   PROF_PROC  - process level; PROF_PRINT / PROF_CLEAR / PROF_ON / PROF_OFF
 *                are applied to every thread and to the process itself.
 *   otherwise  - thread level; the same operations for the calling thread.
 * PROF_ON takes precedence over PROF_OFF when both are given.
 *
 * Returns 0, or the result of profile_accumulate_and_print_job_events() for
 * a job-level print.
 */
int do_profile(int flag)
{
	struct thread *thread = cpu_local_var(current);
	struct process *proc = thread->proc;
	unsigned long now_ts = rdtsc();

	/* Job level? */
	if (flag & PROF_JOB) {
		dkprintf("%s: JOB %d, flag: 0x%x\n",
				__FUNCTION__, proc->nr_processes, flag);

		if (flag & PROF_PRINT) {
			struct mcs_rwlock_node lock;
			struct thread *_thread;

			/* Accumulate events from all threads to process level */
			mcs_rwlock_reader_lock_noirq(&proc->threads_lock, &lock);
			list_for_each_entry(_thread, &proc->threads_list,
					siblings_list) {
				profile_accumulate_events(_thread, proc);
			}
			mcs_rwlock_reader_unlock_noirq(&proc->threads_lock, &lock);

			/* Accumulate events to job level */
			return profile_accumulate_and_print_job_events(proc);
		}
	}
	/* Process level? */
	else if (flag & PROF_PROC) {
		struct mcs_rwlock_node lock;
		struct thread *_thread;

		dkprintf("%s: PID %d, flag: 0x%x\n",
				__FUNCTION__, proc->pid, flag);

		/* Accumulate events from all threads */
		mcs_rwlock_reader_lock_noirq(&proc->threads_lock, &lock);

		list_for_each_entry(_thread, &proc->threads_list,
				siblings_list) {
			if (flag & PROF_PRINT) {
				profile_accumulate_events(_thread, proc);
			}

			if (flag & PROF_CLEAR) {
				profile_clear_thread(_thread);
			}

			if (flag & PROF_ON) {
				_thread->profile = 1;
			}
			else if (flag & PROF_OFF) {
				profile_stop_thread(_thread, now_ts);
			}
		}

		mcs_rwlock_reader_unlock_noirq(&proc->threads_lock, &lock);

		if (flag & PROF_PRINT) {
			profile_print_proc_stats(proc);
		}

		if (flag & PROF_CLEAR) {
			profile_clear_process(proc);
		}

		/* Make sure future threads profile as well */
		if (flag & PROF_ON) {
			proc->profile = 1;
		}
		else if (flag & PROF_OFF) {
			proc->profile = 0;
		}
	}
	/* Thread level */
	else {
		dkprintf("%s: TID %d, flag: 0x%x\n",
				__FUNCTION__, thread->tid, flag);

		if (flag & PROF_PRINT) {
			profile_print_thread_stats(thread);
		}

		if (flag & PROF_CLEAR) {
			/* Also resets the start and elapsed timestamps */
			profile_clear_thread(thread);
		}

		if (flag & PROF_ON) {
			if (!thread->profile) {
				thread->profile = 1;
				thread->profile_start_ts = 0;
			}
		}
		else if (flag & PROF_OFF) {
			profile_stop_thread(thread, now_ts);
		}
	}

	return 0;
}
/*
 * profile system call: the first syscall argument, truncated to int, is the
 * flag mask handed to do_profile(); its return value is the syscall result.
 */
SYSCALL_DECLARE(profile)
{
	return do_profile((int)ihk_mc_syscall_arg0(ctx));
}
#endif // PROFILE_ENABLE

561
kernel/rbtree.c Normal file
View File

@@ -0,0 +1,561 @@
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
(C) 2002 David Woodhouse <dwmw2@infradead.org>
(C) 2012 Michel Lespinasse <walken@google.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
linux/lib/rbtree.c
*/
#include <rbtree_augmented.h>
#define EXPORT_SYMBOL(x)
/*
* red-black trees properties: http://en.wikipedia.org/wiki/Rbtree
*
* 1) A node is either red or black
* 2) The root is black
* 3) All leaves (NULL) are black
* 4) Both children of every red node are black
* 5) Every simple path from root to leaves contains the same number
* of black nodes.
*
* 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
* consecutive red nodes in a path and every red node is therefore followed by
* a black. So if B is the number of black nodes on every simple path (as per
* 5), then the longest possible path due to 4 is 2B.
*
* We shall indicate color with case, where black nodes are uppercase and red
* nodes will be lowercase. Unknown color nodes shall be drawn as red within
* parentheses and have some accompanying text comment.
*/
/* Turn rb black by setting the RB_BLACK bit(s) in its parent/color word. */
static inline void rb_set_black(struct rb_node *rb)
{
	rb->__rb_parent_color = rb->__rb_parent_color | RB_BLACK;
}
/*
 * Return the parent of a node that is known to be red: the parent/color word
 * is used as the parent pointer without masking anything out.
 * NOTE(review): this presumably relies on RB_RED being encoded as 0 in the
 * low bits of __rb_parent_color -- confirm against the rbtree header.
 */
static inline struct rb_node *rb_red_parent(struct rb_node *red)
{
return (struct rb_node *)red->__rb_parent_color;
}
/*
 * Fix up the parent links after a rotation that moved 'new' into the place
 * of 'old':
 * - 'new' inherits the parent and the color of 'old'
 * - 'old' becomes a child of 'new' and is given the color 'color'
 * - the former parent of 'old' (or the root) is made to point at 'new'
 */
static inline void
__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
		struct rb_root *root, int color)
{
	struct rb_node *grand;

	/* Sample the parent before old's parent/color word is rewritten */
	grand = rb_parent(old);

	new->__rb_parent_color = old->__rb_parent_color;
	rb_set_parent_color(old, new, color);
	__rb_change_child(old, new, grand, root);
}
/*
 * Restore the red-black properties after 'node' has been linked into the
 * tree rooted at 'root'.
 *
 * 'node' is treated as red on entry: its parent is read with rb_red_parent().
 * 'augment_rotate' is called after every rotation with the node that moved
 * down ('old') and the node that took its place ('new').
 *
 * The first branch handles a parent that is the left child of the
 * grandparent; the else branch is its mirror image.
 */
static __always_inline void
__rb_insert(struct rb_node *node, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
while (true) {
/*
 * Loop invariant: node is red
 *
 * If there is a black parent, we are done.
 * Otherwise, take some corrective action as we don't
 * want a red root or two consecutive red nodes.
 */
if (!parent) {
rb_set_parent_color(node, NULL, RB_BLACK);
break;
} else if (rb_is_black(parent))
break;
gparent = rb_red_parent(parent);
tmp = gparent->rb_right;
if (parent != tmp) { /* parent == gparent->rb_left */
if (tmp && rb_is_red(tmp)) {
/*
 * Case 1 - color flips
 *
 *       G            g
 *      / \          / \
 *     p   u  -->   P   U
 *    /            /
 *   n            n
 *
 * However, since g's parent might be red, and
 * 4) does not allow this, we need to recurse
 * at g.
 */
rb_set_parent_color(tmp, gparent, RB_BLACK);
rb_set_parent_color(parent, gparent, RB_BLACK);
node = gparent;
parent = rb_parent(node);
rb_set_parent_color(node, parent, RB_RED);
continue;
}
tmp = parent->rb_right;
if (node == tmp) {
/*
 * Case 2 - left rotate at parent
 *
 *      G             G
 *     / \           / \
 *    p   U  -->    n   U
 *     \           /
 *      n         p
 *
 * This still leaves us in violation of 4), the
 * continuation into Case 3 will fix that.
 */
parent->rb_right = tmp = node->rb_left;
node->rb_left = parent;
if (tmp)
rb_set_parent_color(tmp, parent,
RB_BLACK);
rb_set_parent_color(parent, node, RB_RED);
augment_rotate(parent, node);
parent = node;
tmp = node->rb_right;
}
/*
 * Case 3 - right rotate at gparent
 *
 *        G           P
 *       / \         / \
 *      p   U  -->  n   g
 *     /                 \
 *    n                   U
 */
gparent->rb_left = tmp; /* == parent->rb_right */
parent->rb_right = gparent;
if (tmp)
rb_set_parent_color(tmp, gparent, RB_BLACK);
__rb_rotate_set_parents(gparent, parent, root, RB_RED);
augment_rotate(gparent, parent);
break;
} else {
tmp = gparent->rb_left;
if (tmp && rb_is_red(tmp)) {
/* Case 1 - color flips */
rb_set_parent_color(tmp, gparent, RB_BLACK);
rb_set_parent_color(parent, gparent, RB_BLACK);
node = gparent;
parent = rb_parent(node);
rb_set_parent_color(node, parent, RB_RED);
continue;
}
tmp = parent->rb_left;
if (node == tmp) {
/* Case 2 - right rotate at parent */
parent->rb_left = tmp = node->rb_right;
node->rb_right = parent;
if (tmp)
rb_set_parent_color(tmp, parent,
RB_BLACK);
rb_set_parent_color(parent, node, RB_RED);
augment_rotate(parent, node);
parent = node;
tmp = node->rb_left;
}
/* Case 3 - left rotate at gparent */
gparent->rb_right = tmp; /* == parent->rb_left */
parent->rb_left = gparent;
if (tmp)
rb_set_parent_color(tmp, gparent, RB_BLACK);
__rb_rotate_set_parents(gparent, parent, root, RB_RED);
augment_rotate(gparent, parent);
break;
}
}
}
/*
 * Rebalance the tree after a removal, starting at 'parent'.
 *
 * 'augment_rotate' is called after every rotation with the node that moved
 * down ('old') and the node that took its place ('new').
 *
 * Inline version for rb_erase() use - we want to be able to inline
 * and eliminate the dummy_rotate callback there
 */
static __always_inline void
____rb_erase_color(struct rb_node *parent, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
struct rb_node *node = NULL, *sibling, *tmp1, *tmp2;
while (true) {
/*
 * Loop invariants:
 * - node is black (or NULL on first iteration)
 * - node is not the root (parent is not NULL)
 * - All leaf paths going through parent and node have a
 *   black node count that is 1 lower than other leaf paths.
 */
sibling = parent->rb_right;
if (node != sibling) { /* node == parent->rb_left */
if (rb_is_red(sibling)) {
/*
 * Case 1 - left rotate at parent
 *
 *     P               S
 *    / \             / \
 *   N   s    -->    p   Sr
 *      / \         / \
 *     Sl  Sr      N   Sl
 */
parent->rb_right = tmp1 = sibling->rb_left;
sibling->rb_left = parent;
rb_set_parent_color(tmp1, parent, RB_BLACK);
__rb_rotate_set_parents(parent, sibling, root,
RB_RED);
augment_rotate(parent, sibling);
sibling = tmp1;
}
tmp1 = sibling->rb_right;
if (!tmp1 || rb_is_black(tmp1)) {
tmp2 = sibling->rb_left;
if (!tmp2 || rb_is_black(tmp2)) {
/*
 * Case 2 - sibling color flip
 * (p could be either color here)
 *
 *    (p)           (p)
 *    / \           / \
 *   N   S    -->  N   s
 *      / \           / \
 *     Sl  Sr        Sl  Sr
 *
 * This leaves us violating 5) which
 * can be fixed by flipping p to black
 * if it was red, or by recursing at p.
 * p is red when coming from Case 1.
 */
rb_set_parent_color(sibling, parent,
RB_RED);
if (rb_is_red(parent))
rb_set_black(parent);
else {
node = parent;
parent = rb_parent(node);
if (parent)
continue;
}
break;
}
/*
 * Case 3 - right rotate at sibling
 * (p could be either color here)
 *
 *   (p)           (p)
 *   / \           / \
 *  N   S    -->  N   Sl
 *     / \             \
 *    sl  Sr            s
 *                       \
 *                        Sr
 */
sibling->rb_left = tmp1 = tmp2->rb_right;
tmp2->rb_right = sibling;
parent->rb_right = tmp2;
if (tmp1)
rb_set_parent_color(tmp1, sibling,
RB_BLACK);
augment_rotate(sibling, tmp2);
tmp1 = sibling;
sibling = tmp2;
}
/*
 * Case 4 - left rotate at parent + color flips
 * (p and sl could be either color here.
 * After rotation, p becomes black, s acquires
 * p's color, and sl keeps its color)
 *
 *      (p)             (s)
 *      / \             / \
 *     N   S     -->   P   Sr
 *        / \         / \
 *      (sl) sr      N  (sl)
 */
parent->rb_right = tmp2 = sibling->rb_left;
sibling->rb_left = parent;
rb_set_parent_color(tmp1, sibling, RB_BLACK);
if (tmp2)
rb_set_parent(tmp2, parent);
__rb_rotate_set_parents(parent, sibling, root,
RB_BLACK);
augment_rotate(parent, sibling);
break;
} else {
/* Mirror image of the branch above: node == parent->rb_right */
sibling = parent->rb_left;
if (rb_is_red(sibling)) {
/* Case 1 - right rotate at parent */
parent->rb_left = tmp1 = sibling->rb_right;
sibling->rb_right = parent;
rb_set_parent_color(tmp1, parent, RB_BLACK);
__rb_rotate_set_parents(parent, sibling, root,
RB_RED);
augment_rotate(parent, sibling);
sibling = tmp1;
}
tmp1 = sibling->rb_left;
if (!tmp1 || rb_is_black(tmp1)) {
tmp2 = sibling->rb_right;
if (!tmp2 || rb_is_black(tmp2)) {
/* Case 2 - sibling color flip */
rb_set_parent_color(sibling, parent,
RB_RED);
if (rb_is_red(parent))
rb_set_black(parent);
else {
node = parent;
parent = rb_parent(node);
if (parent)
continue;
}
break;
}
/* Case 3 - left rotate at sibling */
sibling->rb_right = tmp1 = tmp2->rb_left;
tmp2->rb_left = sibling;
parent->rb_left = tmp2;
if (tmp1)
rb_set_parent_color(tmp1, sibling,
RB_BLACK);
augment_rotate(sibling, tmp2);
tmp1 = sibling;
sibling = tmp2;
}
/* Case 4 - right rotate at parent + color flips */
parent->rb_left = tmp2 = sibling->rb_right;
sibling->rb_right = parent;
rb_set_parent_color(tmp1, sibling, RB_BLACK);
if (tmp2)
rb_set_parent(tmp2, parent);
__rb_rotate_set_parents(parent, sibling, root,
RB_BLACK);
augment_rotate(parent, sibling);
break;
}
}
}
/*
 * Non-inline version for rb_erase_augmented() use.
 * Forwards to ____rb_erase_color() with the caller's rotation callback.
 */
void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
____rb_erase_color(parent, root, augment_rotate);
}
EXPORT_SYMBOL(__rb_erase_color);
/*
 * Non-augmented rbtree manipulation functions.
 *
 * We use dummy augmented callbacks here, and have the compiler optimize them
 * out of the rb_insert_color() and rb_erase() function definitions.
 */
/* No-op callbacks: plain trees keep no per-node augmented data. */
static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {}
static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}
/* Initializer order: propagate, copy, rotate. */
static const struct rb_augment_callbacks dummy_callbacks = {
dummy_propagate, dummy_copy, dummy_rotate
};
/*
 * Rebalance a plain (non-augmented) tree after 'node' was linked into it;
 * runs __rb_insert() with the no-op rotation callback.
 */
void rb_insert_color(struct rb_node *node, struct rb_root *root)
{
__rb_insert(node, root, dummy_rotate);
}
EXPORT_SYMBOL(rb_insert_color);
/*
 * Remove 'node' from the plain (non-augmented) tree 'root'.
 * __rb_erase_augmented() unlinks the node and reports where rebalancing has
 * to start; NULL means the tree needs no further fixing.
 */
void rb_erase(struct rb_node *node, struct rb_root *root)
{
	struct rb_node *fixup_at =
		__rb_erase_augmented(node, root, &dummy_callbacks);

	if (!fixup_at)
		return;

	____rb_erase_color(fixup_at, root, dummy_rotate);
}
EXPORT_SYMBOL(rb_erase);
/*
 * Augmented rbtree manipulation functions.
 *
 * This instantiates the same __always_inline functions as in the non-augmented
 * case, but this time with user-defined callbacks.
 */
/* Rebalance after insertion, calling 'augment_rotate' after each rotation. */
void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
__rb_insert(node, root, augment_rotate);
}
EXPORT_SYMBOL(__rb_insert_augmented);
/*
 * Return the leftmost node of the tree, i.e. the first one in sort order,
 * or NULL when the tree is empty.
 */
struct rb_node *rb_first(const struct rb_root *root)
{
	struct rb_node *cur = root->rb_node;

	if (cur) {
		while (cur->rb_left)
			cur = cur->rb_left;
	}

	return cur;
}
EXPORT_SYMBOL(rb_first);
/*
 * Return the rightmost node of the tree, i.e. the last one in sort order,
 * or NULL when the tree is empty.
 */
struct rb_node *rb_last(const struct rb_root *root)
{
	struct rb_node *cur = root->rb_node;

	if (cur) {
		while (cur->rb_right)
			cur = cur->rb_right;
	}

	return cur;
}
EXPORT_SYMBOL(rb_last);
/*
 * Return the in-order successor of 'node', or NULL if there is none or if
 * RB_EMPTY_NODE() reports the node as empty.
 */
struct rb_node *rb_next(const struct rb_node *node)
{
	struct rb_node *up;

	if (RB_EMPTY_NODE(node))
		return NULL;

	/* With a right subtree, the successor is that subtree's leftmost node */
	if (node->rb_right) {
		const struct rb_node *down = node->rb_right;

		while (down->rb_left)
			down = down->rb_left;
		return (struct rb_node *)down;
	}

	/*
	 * Otherwise climb until we arrive from a left child (that parent is
	 * the successor) or run out of ancestors (no successor).
	 */
	for (;;) {
		up = rb_parent(node);
		if (!up || node != up->rb_right)
			break;
		node = up;
	}

	return up;
}
EXPORT_SYMBOL(rb_next);
/*
 * Return the in-order predecessor of 'node', or NULL if there is none or if
 * RB_EMPTY_NODE() reports the node as empty.
 */
struct rb_node *rb_prev(const struct rb_node *node)
{
	struct rb_node *up;

	if (RB_EMPTY_NODE(node))
		return NULL;

	/* With a left subtree, the predecessor is that subtree's rightmost node */
	if (node->rb_left) {
		const struct rb_node *down = node->rb_left;

		while (down->rb_right)
			down = down->rb_right;
		return (struct rb_node *)down;
	}

	/*
	 * Otherwise climb until we arrive from a right child (that parent is
	 * the predecessor) or run out of ancestors (no predecessor).
	 */
	for (;;) {
		up = rb_parent(node);
		if (!up || node != up->rb_left)
			break;
		node = up;
	}

	return up;
}
EXPORT_SYMBOL(rb_prev);
/*
 * Put 'new' into the exact tree position of 'victim' without rebalancing:
 * the parent (or root) and both children are re-pointed at 'new', which then
 * takes over victim's links and color.
 */
void rb_replace_node(struct rb_node *victim, struct rb_node *new,
		struct rb_root *root)
{
	struct rb_node *up = rb_parent(victim);
	struct rb_node *child;

	/* Set the surrounding nodes to point to the replacement */
	__rb_change_child(victim, new, up, root);

	child = victim->rb_left;
	if (child)
		rb_set_parent(child, new);

	child = victim->rb_right;
	if (child)
		rb_set_parent(child, new);

	/* Copy the pointers/colour from the victim to the replacement */
	*new = *victim;
}
EXPORT_SYMBOL(rb_replace_node);
/*
 * Descend from 'node' to a leaf, preferring the left child and taking the
 * right child only where no left child exists.
 */
static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
{
	const struct rb_node *child;

	while ((child = node->rb_left ? node->rb_left : node->rb_right) != NULL)
		node = child;

	return (struct rb_node *)node;
}
/*
 * Return the node that follows 'node' in a post-order walk, or NULL when
 * 'node' is NULL or is the last node of the walk (it has no parent).
 */
struct rb_node *rb_next_postorder(const struct rb_node *node)
{
	const struct rb_node *up;

	if (node == NULL)
		return NULL;

	up = rb_parent(node);

	/*
	 * Our own subtree is done. Unless we are a left child whose parent
	 * also has a right subtree, the parent itself comes next.
	 */
	if (up == NULL || node != up->rb_left || up->rb_right == NULL)
		return (struct rb_node *)up;

	/* Otherwise the parent's right subtree has to be visited first */
	return rb_left_deepest_node(up->rb_right);
}
EXPORT_SYMBOL(rb_next_postorder);
/*
 * Return the first node of a post-order walk of the tree, or NULL when the
 * tree is empty.
 */
struct rb_node *rb_first_postorder(const struct rb_root *root)
{
	struct rb_node *top = root->rb_node;

	return top ? rb_left_deepest_node(top) : NULL;
}
EXPORT_SYMBOL(rb_first_postorder);

View File

@@ -179,6 +179,7 @@ int shmobj_create(struct shmid_ds *ds, struct memobj **objp)
memset(obj, 0, sizeof(*obj));
obj->memobj.ops = &shmobj_ops;
obj->memobj.size = ds->shm_segsz;
obj->ds = *ds;
obj->ds.shm_perm.seq = the_seq++;
obj->ds.shm_nattch = 1;
@@ -240,14 +241,24 @@ void shmobj_destroy(struct shmobj *obj)
npages = (size_t)1 << (obj->pgshift - PAGE_SHIFT);
for (;;) {
struct page *page;
int count;
void *page_va;
page = page_list_first(obj);
if (!page) {
break;
}
page_list_remove(obj, page);
page_va = phys_to_virt(page_to_phys(page));
if (ihk_atomic_read(&page->count) != 1) {
kprintf("%s: WARNING: page count for phys 0x%lx is invalid\n",
__FUNCTION__, page->phys);
}
if (page_unmap(page)) {
ihk_mc_free_pages_user(page_va, npages);
}
#if 0
dkprintf("shmobj_destroy(%p):"
"release page. %p %#lx %d %d",
obj, page, page_to_phys(page),
@@ -265,7 +276,8 @@ void shmobj_destroy(struct shmobj *obj)
}
page->mode = PM_NONE;
free_pages(phys_to_virt(page_to_phys(page)), npages);
ihk_mc_free_pages(phys_to_virt(page_to_phys(page)), npages);
#endif
}
if (obj->index < 0) {
kfree(obj);
@@ -394,7 +406,7 @@ static int shmobj_get_page(struct memobj *memobj, off_t off, int p2align,
page = page_list_lookup(obj, off);
if (!page) {
npages = 1 << p2align;
virt = ihk_mc_alloc_aligned_pages(npages, p2align,
virt = ihk_mc_alloc_aligned_pages_user(npages, p2align,
IHK_MC_AP_NOWAIT);
if (!virt) {
error = -ENOMEM;
@@ -404,7 +416,7 @@ static int shmobj_get_page(struct memobj *memobj, off_t off, int p2align,
goto out;
}
phys = virt_to_phys(virt);
page = phys_to_page(phys);
page = phys_to_page_insert_hash(phys);
if (page->mode != PM_NONE) {
fkprintf("shmobj_get_page(%p,%#lx,%d,%p):"
"page %p %#lx %d %d %#lx\n",
@@ -431,7 +443,7 @@ static int shmobj_get_page(struct memobj *memobj, off_t off, int p2align,
out:
memobj_unlock(&obj->memobj);
if (virt) {
ihk_mc_free_pages(virt, npages);
ihk_mc_free_pages_user(virt, npages);
}
dkprintf("shmobj_get_page(%p,%#lx,%d,%p):%d\n",
memobj, off, p2align, physp, error);
@@ -455,7 +467,8 @@ static int shmobj_invalidate_page(struct memobj *memobj, uintptr_t phys,
if (ihk_atomic_read(&page->count) == 1) {
if (page_unmap(page)) {
ihk_mc_free_pages(phys_to_virt(phys), pgsize/PAGE_SIZE);
ihk_mc_free_pages_user(phys_to_virt(phys),
pgsize/PAGE_SIZE);
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -113,7 +113,7 @@ sysfs_createf(struct sysfs_ops *ops, void *instance, int mode,
packet.msg = SCD_MSG_SYSFS_REQ_CREATE;
packet.sysfs_arg1 = virt_to_phys(param);
error = ihk_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
error = ihk_ikc_send(cpu_local_var(ikc2linux), &packet, 0);
if (error) {
ekprintf("sysfs_createf:ihk_ikc_send failed. %d\n", error);
goto out;
@@ -183,7 +183,7 @@ sysfs_mkdirf(sysfs_handle_t *dirhp, const char *fmt, ...)
packet.msg = SCD_MSG_SYSFS_REQ_MKDIR;
packet.sysfs_arg1 = virt_to_phys(param);
error = ihk_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
error = ihk_ikc_send(cpu_local_var(ikc2linux), &packet, 0);
if (error) {
ekprintf("sysfs_mkdirf:ihk_ikc_send failed. %d\n", error);
goto out;
@@ -257,7 +257,7 @@ sysfs_symlinkf(sysfs_handle_t targeth, const char *fmt, ...)
packet.msg = SCD_MSG_SYSFS_REQ_SYMLINK;
packet.sysfs_arg1 = virt_to_phys(param);
error = ihk_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
error = ihk_ikc_send(cpu_local_var(ikc2linux), &packet, 0);
if (error) {
ekprintf("sysfs_symlinkf:ihk_ikc_send failed. %d\n", error);
goto out;
@@ -328,7 +328,7 @@ sysfs_lookupf(sysfs_handle_t *objhp, const char *fmt, ...)
packet.msg = SCD_MSG_SYSFS_REQ_LOOKUP;
packet.sysfs_arg1 = virt_to_phys(param);
error = ihk_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
error = ihk_ikc_send(cpu_local_var(ikc2linux), &packet, 0);
if (error) {
ekprintf("sysfs_lookupf:ihk_ikc_send failed. %d\n", error);
goto out;
@@ -402,7 +402,7 @@ sysfs_unlinkf(int flags, const char *fmt, ...)
packet.msg = SCD_MSG_SYSFS_REQ_UNLINK;
packet.sysfs_arg1 = virt_to_phys(param);
error = ihk_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
error = ihk_ikc_send(cpu_local_var(ikc2linux), &packet, 0);
if (error) {
ekprintf("sysfs_unlinkf:ihk_ikc_send failed. %d\n", error);
goto out;
@@ -462,7 +462,7 @@ sysfss_req_show(long nodeh, struct sysfs_ops *ops, void *instance)
packet.sysfs_arg1 = nodeh;
packet.sysfs_arg2 = ssize;
error = ihk_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
error = ihk_ikc_send(cpu_local_var(ikc2linux), &packet, 0);
if (error) {
ekprintf("sysfss_req_show:ihk_ikc_send failed. %d\n", error);
/* through */
@@ -508,7 +508,7 @@ sysfss_req_store(long nodeh, struct sysfs_ops *ops, void *instance,
packet.sysfs_arg1 = nodeh;
packet.sysfs_arg2 = ssize;
error = ihk_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
error = ihk_ikc_send(cpu_local_var(ikc2linux), &packet, 0);
if (error) {
ekprintf("sysfss_req_store:ihk_ikc_send failed. %d\n", error);
/* through */
@@ -539,7 +539,7 @@ sysfss_req_release(long nodeh, struct sysfs_ops *ops, void *instance)
packet.err = 0;
packet.sysfs_arg1 = nodeh;
error = ihk_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
error = ihk_ikc_send(cpu_local_var(ikc2linux), &packet, 0);
if (error) {
ekprintf("sysfss_req_release:ihk_ikc_send failed. %d\n",
error);
@@ -623,7 +623,7 @@ sysfs_init(void)
packet.msg = SCD_MSG_SYSFS_REQ_SETUP;
packet.sysfs_arg1 = virt_to_phys(param);
error = ihk_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
error = ihk_ikc_send(cpu_local_var(ikc2linux), &packet, 0);
if (error) {
ekprintf("sysfs_init:ihk_ikc_send failed. %d\n", error);
goto out;

View File

@@ -54,136 +54,75 @@ void init_timers(void)
}
uint64_t schedule_timeout(uint64_t timeout)
{
struct waitq_entry my_wait;
struct timer my_timer;
{
struct thread *thread = cpu_local_var(current);
int irqstate;
int spin_sleep;
irqstate = ihk_mc_spinlock_lock(&thread->spin_sleep_lock);
dkprintf("schedule_timeout() spin sleep timeout: %lu\n", timeout);
spin_sleep = ++thread->spin_sleep;
ihk_mc_spinlock_unlock(&thread->spin_sleep_lock, irqstate);
long irqstate;
/* Spin sleep.. */
for (;;) {
int need_schedule;
struct cpu_local_var *v = get_this_cpu_local_var();
uint64_t t_s = rdtsc();
uint64_t t_e;
int spin_over = 0;
irqstate = ihk_mc_spinlock_lock(&thread->spin_sleep_lock);
/* Woken up by someone? */
if (thread->spin_sleep < 1) {
if (thread->spin_sleep == 0) {
t_e = rdtsc();
spin_over = 1;
if ((t_e - t_s) < timeout) {
timeout -= (t_e - t_s);
}
else {
timeout = 1;
}
ihk_mc_spinlock_unlock(&thread->spin_sleep_lock, irqstate);
break;
}
ihk_mc_spinlock_unlock(&thread->spin_sleep_lock, irqstate);
if (!spin_over) {
t_s = rdtsc();
int need_schedule;
struct cpu_local_var *v = get_this_cpu_local_var();
int irqstate = ihk_mc_spinlock_lock(&(v->runq_lock));
need_schedule = v->runq_len > 1 ? 1 : 0;
/* Give a chance to another thread (if any) in case the core is
* oversubscribed, but make sure we will be re-scheduled */
irqstate = ihk_mc_spinlock_lock(&(v->runq_lock));
need_schedule = v->runq_len > 1 ? 1 : 0;
if (need_schedule) {
xchg4(&(cpu_local_var(current)->status), PS_RUNNING);
ihk_mc_spinlock_unlock(&(v->runq_lock), irqstate);
schedule();
/* Give a chance to another thread (if any) in case the core is
* oversubscribed, but make sure we will be re-scheduled */
if (need_schedule) {
xchg4(&(cpu_local_var(current)->status), PS_RUNNING);
schedule();
xchg4(&(cpu_local_var(current)->status),
PS_INTERRUPTIBLE);
}
else {
/* Spin wait */
while ((rdtsc() - t_s) < LOOP_TIMEOUT) {
cpu_pause();
}
if (timeout < LOOP_TIMEOUT) {
timeout = 0;
spin_over = 1;
}
else {
timeout -= LOOP_TIMEOUT;
}
}
/* Recheck if woken */
continue;
}
else {
ihk_mc_spinlock_unlock(&(v->runq_lock), irqstate);
}
if (spin_over) {
dkprintf("schedule_timeout() spin woken up, timeout: %lu\n",
timeout);
/* Give a chance to another thread (if any) in case we timed out,
* but make sure we will be re-scheduled */
if (timeout == 0) {
int need_schedule;
struct cpu_local_var *v = get_this_cpu_local_var();
int irqstate =
ihk_mc_spinlock_lock(&(v->runq_lock));
need_schedule = v->runq_len > 1 ? 1 : 0;
ihk_mc_spinlock_unlock(&(v->runq_lock), irqstate);
/* Spin wait */
while ((rdtsc() - t_s) < LOOP_TIMEOUT) {
cpu_pause();
}
if (need_schedule) {
xchg4(&(cpu_local_var(current)->status), PS_RUNNING);
schedule();
xchg4(&(cpu_local_var(current)->status),
PS_INTERRUPTIBLE);
}
}
/* Time out? */
if (timeout < LOOP_TIMEOUT) {
timeout = 0;
/* We are not sleeping any more */
irqstate = ihk_mc_spinlock_lock(&thread->spin_sleep_lock);
if (spin_sleep == thread->spin_sleep) {
--thread->spin_sleep;
}
thread->spin_sleep = 0;
ihk_mc_spinlock_unlock(&thread->spin_sleep_lock, irqstate);
return timeout;
break;
}
else {
timeout -= LOOP_TIMEOUT;
}
}
/* Init waitq and wait entry for this timer */
my_timer.timeout = (timeout < LOOP_TIMEOUT) ? LOOP_TIMEOUT : timeout;
my_timer.thread = cpu_local_var(current);
waitq_init(&my_timer.processes);
waitq_init_entry(&my_wait, cpu_local_var(current));
/* Add ourself to the timer queue */
ihk_mc_spinlock_lock_noirq(&timers_lock);
list_add_tail(&my_timer.list, &timers);
dkprintf("schedule_timeout() sleep timeout: %lu\n", my_timer.timeout);
/* Add ourself to the waitqueue and sleep */
waitq_prepare_to_wait(&my_timer.processes, &my_wait, PS_INTERRUPTIBLE);
ihk_mc_spinlock_unlock_noirq(&timers_lock);
schedule();
waitq_finish_wait(&my_timer.processes, &my_wait);
ihk_mc_spinlock_lock_noirq(&timers_lock);
/* Waken up by someone else then timeout? */
if (my_timer.timeout) {
list_del(&my_timer.list);
}
ihk_mc_spinlock_unlock_noirq(&timers_lock);
dkprintf("schedule_timeout() woken up, timeout: %lu\n",
my_timer.timeout);
return my_timer.timeout;
return timeout;
}

View File

@@ -22,6 +22,13 @@ default_wake_function(waitq_entry_t *entry, unsigned mode,
return sched_wakeup_thread(entry->private, PS_NORMAL);
}
/*
 * Wait-queue wake callback: wakes the thread stored in entry->private by
 * calling sched_wakeup_thread_locked() with PS_NORMAL and returns that
 * call's result.  The mode, flags and key arguments are not used.
 *
 * NOTE(review): presumably meant for callers that already hold the lock
 * sched_wakeup_thread() would otherwise take -- confirm against the
 * scheduler code.
 */
int
locked_wake_function(waitq_entry_t *entry, unsigned mode,
int flags, void *key)
{
return sched_wakeup_thread_locked(entry->private, PS_NORMAL);
}
void
waitq_init(waitq_t *waitq)
{

2261
kernel/xpmem.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -102,6 +102,7 @@ static int alloc_zeroobj(void)
memset(obj, 0, sizeof(*obj));
obj->memobj.ops = &zeroobj_ops;
obj->memobj.size = 0;
page_list_init(obj);
ihk_mc_spinlock_init(&obj->memobj.lock);
@@ -112,7 +113,7 @@ static int alloc_zeroobj(void)
goto out;
}
phys = virt_to_phys(virt);
page = phys_to_page(phys);
page = phys_to_page_insert_hash(phys);
if (page->mode != PM_NONE) {
fkprintf("alloc_zeroobj():"

View File

@@ -1,8 +1,17 @@
#include <ihk/debug.h>
#include <ihk/cpu.h>
#include <cls.h>
#include <ihk/rusage.h>
extern struct cpu_local_var *clv;
void panic(const char *msg)
{
if (clv) {
struct ihk_os_cpu_monitor *monitor = cpu_local_var(monitor);
monitor->status = IHK_OS_MONITOR_PANIC;
}
cpu_disable_interrupt();
kprintf(msg);

1179
lib/bitmap.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -140,3 +140,58 @@ found:
return result + ffz(tmp);
}
/**
 * __sw_hweight32 - software population count of a 32-bit word
 * @w: the word to weigh
 *
 * Returns the Hamming weight of @w, i.e. how many of its bits are set.
 * Bits are summed in parallel: first per 2-bit pair, then per nibble,
 * then per byte.  The per-byte sums are combined either with a single
 * multiply (ARCH_HAS_FAST_MULTIPLIER) or with two shift-and-add steps.
 */
unsigned int __sw_hweight32(unsigned int w)
{
	unsigned int pairs = w - ((w >> 1) & 0x55555555);
	unsigned int nibbles = (pairs & 0x33333333) +
			       ((pairs >> 2) & 0x33333333);
	unsigned int bytes = (nibbles + (nibbles >> 4)) & 0x0f0f0f0f;

#ifdef ARCH_HAS_FAST_MULTIPLIER
	/* The multiply accumulates all four byte sums in the top byte */
	return (bytes * 0x01010101) >> 24;
#else
	bytes += bytes >> 8;
	return (bytes + (bytes >> 16)) & 0x000000FF;
#endif
}
/*
 * __sw_hweight16 - software population count of the low 16 bits of @w.
 * Sums bits per pair, per nibble and per byte, then adds the two bytes.
 */
unsigned int __sw_hweight16(unsigned int w)
{
	unsigned int pairs = w - ((w >> 1) & 0x5555);
	unsigned int nibbles = (pairs & 0x3333) + ((pairs >> 2) & 0x3333);
	unsigned int bytes = (nibbles + (nibbles >> 4)) & 0x0F0F;

	return (bytes + (bytes >> 8)) & 0x00FF;
}
/*
 * __sw_hweight8 - software population count of the low 8 bits of @w.
 * Sums bits per pair and per nibble, then adds the two nibbles.
 */
unsigned int __sw_hweight8(unsigned int w)
{
	unsigned int pairs = w - ((w >> 1) & 0x55);
	unsigned int nibbles = (pairs & 0x33) + ((pairs >> 2) & 0x33);

	return (nibbles + (nibbles >> 4)) & 0x0F;
}
/*
 * __sw_hweight64 - software population count of a 64-bit word.
 * Same parallel summation as the 32-bit variant, widened to 64 bits:
 * per-pair, per-nibble and per-byte sums, combined either by one
 * multiply (ARCH_HAS_FAST_MULTIPLIER) or by three shift-and-add steps.
 */
unsigned long __sw_hweight64(uint64_t w)
{
	uint64_t pairs = w - ((w >> 1) & 0x5555555555555555ul);
	uint64_t nibbles = (pairs & 0x3333333333333333ul) +
			   ((pairs >> 2) & 0x3333333333333333ul);
	uint64_t bytes = (nibbles + (nibbles >> 4)) & 0x0f0f0f0f0f0f0f0ful;

#ifdef ARCH_HAS_FAST_MULTIPLIER
	/* The multiply accumulates all eight byte sums in the top byte */
	return (bytes * 0x0101010101010101ul) >> 56;
#else
	bytes += bytes >> 8;
	bytes += bytes >> 16;
	return (bytes + (bytes >> 32)) & 0x00000000000000FFul;
#endif
}

307
lib/include/bitmap.h Normal file
View File

@@ -0,0 +1,307 @@
#ifndef __LINUX_BITMAP_H
#define __LINUX_BITMAP_H
#include <types.h>
#include <bitops.h>
#include <string.h>
/*
* bitmaps provide bit arrays that consume one or more unsigned
* longs. The bitmap interface and available operations are listed
* here, in bitmap.h
*
* Function implementations generic to all architectures are in
* lib/bitmap.c. Function implementations that are architecture
* specific are in various include/asm-<arch>/bitops.h headers
* and other arch/<arch> specific files.
*
* See lib/bitmap.c for more details.
*/
/*
* The available bitmap operations and their rough meaning in the
* case that the bitmap is a single unsigned long are thus:
*
* Note that nbits should be always a compile time evaluable constant.
* Otherwise many inlines will generate horrible code.
*
* bitmap_zero(dst, nbits) *dst = 0UL
* bitmap_fill(dst, nbits) *dst = ~0UL
* bitmap_copy(dst, src, nbits) *dst = *src
* bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2
* bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2
* bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2
* bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2)
* bitmap_complement(dst, src, nbits) *dst = ~(*src)
* bitmap_equal(src1, src2, nbits) Are *src1 and *src2 equal?
* bitmap_intersects(src1, src2, nbits) Do *src1 and *src2 overlap?
* bitmap_subset(src1, src2, nbits) Is *src1 a subset of *src2?
* bitmap_empty(src, nbits) Are all bits zero in *src?
* bitmap_full(src, nbits) Are all bits set in *src?
* bitmap_weight(src, nbits) Hamming Weight: number set bits
* bitmap_set(dst, pos, nbits) Set specified bit area
* bitmap_clear(dst, pos, nbits) Clear specified bit area
* bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area
* bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n
* bitmap_shift_left(dst, src, n, nbits) *dst = *src << n
* bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src)
* bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit)
* bitmap_onto(dst, orig, relmap, nbits) *dst = orig relative to relmap
* bitmap_fold(dst, orig, sz, nbits) dst bits = orig bits mod sz
* bitmap_scnprintf(buf, len, src, nbits) Print bitmap src to buf
* bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf
* bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf
* bitmap_scnlistprintf(buf, len, src, nbits) Print bitmap src as list to buf
* bitmap_parselist(buf, dst, nbits) Parse bitmap dst from kernel buf
* bitmap_parselist_user(buf, dst, nbits) Parse bitmap dst from user buf
* bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region
* bitmap_release_region(bitmap, pos, order) Free specified bit region
* bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region
*/
/*
* Also the following operations in asm/bitops.h apply to bitmaps.
*
* set_bit(bit, addr) *addr |= bit
* clear_bit(bit, addr) *addr &= ~bit
* change_bit(bit, addr) *addr ^= bit
* test_bit(bit, addr) Is bit set in *addr?
* test_and_set_bit(bit, addr) Set bit and return old value
* test_and_clear_bit(bit, addr) Clear bit and return old value
* test_and_change_bit(bit, addr) Change bit and return old value
* find_first_zero_bit(addr, nbits) Position first zero bit in *addr
* find_first_bit(addr, nbits) Position first set bit in *addr
* find_next_zero_bit(addr, nbits, bit) Position next zero bit in *addr >= bit
* find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit
*/
/*
* The DECLARE_BITMAP(name,bits) macro, in linux/types.h, can be used
* to declare an array named 'name' of just enough unsigned longs to
* contain all bit positions from 0 to 'bits' - 1.
*/
/*
* lib/bitmap.c provides these functions:
*/
#define __user
#define __force
#define u32 uint32_t
extern int __bitmap_empty(const unsigned long *bitmap, int bits);
extern int __bitmap_full(const unsigned long *bitmap, int bits);
extern int __bitmap_equal(const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern void __bitmap_complement(unsigned long *dst, const unsigned long *src,
int bits);
extern void __bitmap_shift_right(unsigned long *dst,
const unsigned long *src, int shift, int bits);
extern void __bitmap_shift_left(unsigned long *dst,
const unsigned long *src, int shift, int bits);
extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern int __bitmap_intersects(const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern int __bitmap_subset(const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern int __bitmap_weight(const unsigned long *bitmap, int bits);
extern void bitmap_set(unsigned long *map, int i, int len);
extern void bitmap_clear(unsigned long *map, int start, int nr);
extern unsigned long bitmap_find_next_zero_area(unsigned long *map,
unsigned long size,
unsigned long start,
unsigned int nr,
unsigned long align_mask);
extern int bitmap_scnprintf(char *buf, unsigned int len,
const unsigned long *src, int nbits);
extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user,
unsigned long *dst, int nbits);
extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
unsigned long *dst, int nbits);
extern int bitmap_scnlistprintf(char *buf, unsigned int len,
const unsigned long *src, int nbits);
extern int bitmap_parselist(const char *buf, unsigned long *maskp,
int nmaskbits);
extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
unsigned long *dst, int nbits);
extern void bitmap_remap(unsigned long *dst, const unsigned long *src,
const unsigned long *old, const unsigned long *new, int bits);
extern int bitmap_bitremap(int oldbit,
const unsigned long *old, const unsigned long *new, int bits);
extern void bitmap_onto(unsigned long *dst, const unsigned long *orig,
const unsigned long *relmap, int bits);
extern void bitmap_fold(unsigned long *dst, const unsigned long *orig,
int sz, int bits);
extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order);
extern void bitmap_release_region(unsigned long *bitmap, int pos, int order);
extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits);
#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG))
#define BITMAP_LAST_WORD_MASK(nbits) \
( \
((nbits) % BITS_PER_LONG) ? \
(1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL \
)
#define small_const_nbits(nbits) \
(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)
/*
 * bitmap_zero - clear a bitmap.
 * @dst: bitmap to clear
 * @nbits: number of bits in the bitmap
 *
 * A compile-time constant size that fits in one word is handled with a
 * single store; otherwise every word that backs @nbits is zeroed.
 */
static inline void bitmap_zero(unsigned long *dst, int nbits)
{
	int nbytes;

	if (small_const_nbits(nbits)) {
		*dst = 0UL;
		return;
	}

	nbytes = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
	memset(dst, 0, nbytes);
}
/*
 * bitmap_fill - set the first @nbits bits of a bitmap.
 * @dst: bitmap to fill
 * @nbits: number of bits in the bitmap
 *
 * All full words except the last are set to all-ones; the last word only
 * gets the bits below @nbits set (BITMAP_LAST_WORD_MASK), so bits past
 * the end of the bitmap are written as zero.
 *
 * An empty bitmap (@nbits <= 0) is left untouched.  Without this guard
 * nlongs would be 0, so "nlongs - 1" would wrap around, producing a bogus
 * memset length and a store in front of @dst.
 */
static inline void bitmap_fill(unsigned long *dst, int nbits)
{
	size_t nlongs;

	if (nbits <= 0)
		return;

	nlongs = BITS_TO_LONGS(nbits);
	if (!small_const_nbits(nbits)) {
		/* Every word but the last one is completely set */
		int len = (nlongs - 1) * sizeof(unsigned long);
		memset(dst, 0xff, len);
	}
	dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits);
}
/*
 * bitmap_copy - copy bitmap @src into @dst.
 * @nbits: number of bits in both bitmaps
 *
 * A compile-time constant size that fits in one word is copied with a
 * single assignment; otherwise every word that backs @nbits is copied.
 */
static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
			int nbits)
{
	int nbytes;

	if (small_const_nbits(nbits)) {
		*dst = *src;
		return;
	}

	nbytes = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
	memcpy(dst, src, nbytes);
}
/*
 * bitmap_and - *dst = *src1 & *src2.
 * @nbits: number of bits in the bitmaps
 *
 * The single-word case returns non-zero when the stored word is
 * non-zero; larger bitmaps are delegated to __bitmap_and() and its
 * return value is passed through.
 */
static inline int bitmap_and(unsigned long *dst, const unsigned long *src1,
			const unsigned long *src2, int nbits)
{
	if (!small_const_nbits(nbits))
		return __bitmap_and(dst, src1, src2, nbits);

	*dst = *src1 & *src2;
	return *dst != 0;
}
/*
 * bitmap_or - *dst = *src1 | *src2.
 * @nbits: number of bits in the bitmaps
 *
 * Larger or non-constant sizes are delegated to __bitmap_or().
 */
static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
			const unsigned long *src2, int nbits)
{
	if (!small_const_nbits(nbits)) {
		__bitmap_or(dst, src1, src2, nbits);
		return;
	}

	*dst = *src1 | *src2;
}
/*
 * bitmap_xor - *dst = *src1 ^ *src2.
 * @nbits: number of bits in the bitmaps
 *
 * Larger or non-constant sizes are delegated to __bitmap_xor().
 */
static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1,
			const unsigned long *src2, int nbits)
{
	if (!small_const_nbits(nbits)) {
		__bitmap_xor(dst, src1, src2, nbits);
		return;
	}

	*dst = *src1 ^ *src2;
}
/*
 * bitmap_andnot - *dst = *src1 & ~(*src2).
 * @nbits: number of bits in the bitmaps
 *
 * The single-word case returns non-zero when the stored word is
 * non-zero; larger bitmaps are delegated to __bitmap_andnot() and its
 * return value is passed through.
 */
static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1,
			const unsigned long *src2, int nbits)
{
	if (!small_const_nbits(nbits))
		return __bitmap_andnot(dst, src1, src2, nbits);

	*dst = *src1 & ~(*src2);
	return *dst != 0;
}
/*
 * bitmap_complement - *dst = ~(*src), limited to the first @nbits bits.
 *
 * In the single-word case the inverted word is masked with
 * BITMAP_LAST_WORD_MASK so bits at or above @nbits are stored as zero.
 * Larger or non-constant sizes are delegated to __bitmap_complement().
 */
static inline void bitmap_complement(unsigned long *dst, const unsigned long *src,
			int nbits)
{
	if (!small_const_nbits(nbits)) {
		__bitmap_complement(dst, src, nbits);
		return;
	}

	*dst = ~(*src) & BITMAP_LAST_WORD_MASK(nbits);
}
/*
 * bitmap_equal - do @src1 and @src2 hold the same first @nbits bits?
 *
 * Returns non-zero when they are equal.  The single-word case ignores
 * bits at or above @nbits; other sizes go through __bitmap_equal().
 */
static inline int bitmap_equal(const unsigned long *src1,
			const unsigned long *src2, int nbits)
{
	unsigned long differing;

	if (!small_const_nbits(nbits))
		return __bitmap_equal(src1, src2, nbits);

	differing = (*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits);
	return differing == 0;
}
/*
 * bitmap_intersects - do @src1 and @src2 share any set bit below @nbits?
 *
 * Returns non-zero when they overlap.  The single-word case ignores
 * bits at or above @nbits; other sizes go through __bitmap_intersects().
 */
static inline int bitmap_intersects(const unsigned long *src1,
			const unsigned long *src2, int nbits)
{
	unsigned long common;

	if (!small_const_nbits(nbits))
		return __bitmap_intersects(src1, src2, nbits);

	common = (*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits);
	return common != 0;
}
/*
 * bitmap_subset - is every bit set in @src1 (below @nbits) also set in
 * @src2?
 *
 * Returns non-zero when @src1 is a subset of @src2.  The single-word
 * case ignores bits at or above @nbits; other sizes go through
 * __bitmap_subset().
 */
static inline int bitmap_subset(const unsigned long *src1,
			const unsigned long *src2, int nbits)
{
	unsigned long extra;

	if (!small_const_nbits(nbits))
		return __bitmap_subset(src1, src2, nbits);

	/* Bits present in src1 but missing from src2 */
	extra = (*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits);
	return extra == 0;
}
/*
 * bitmap_empty - are all of the first @nbits bits of @src clear?
 *
 * Returns non-zero when no bit is set.  The single-word case ignores
 * bits at or above @nbits; other sizes go through __bitmap_empty().
 */
static inline int bitmap_empty(const unsigned long *src, int nbits)
{
	unsigned long set_bits;

	if (!small_const_nbits(nbits))
		return __bitmap_empty(src, nbits);

	set_bits = *src & BITMAP_LAST_WORD_MASK(nbits);
	return set_bits == 0;
}
/*
 * bitmap_full - are all of the first @nbits bits of @src set?
 *
 * Returns non-zero when every bit is set.  The single-word case ignores
 * bits at or above @nbits; other sizes go through __bitmap_full().
 */
static inline int bitmap_full(const unsigned long *src, int nbits)
{
	unsigned long clear_bits;

	if (!small_const_nbits(nbits))
		return __bitmap_full(src, nbits);

	clear_bits = ~(*src) & BITMAP_LAST_WORD_MASK(nbits);
	return clear_bits == 0;
}
/*
 * bitmap_weight - number of set bits among the first @nbits bits of @src.
 *
 * The single-word case counts the masked word with hweight_long();
 * other sizes go through __bitmap_weight().
 */
static inline int bitmap_weight(const unsigned long *src, int nbits)
{
	if (!small_const_nbits(nbits))
		return __bitmap_weight(src, nbits);

	return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
}
/*
 * bitmap_shift_right - *dst = *src >> n, for a bitmap of @nbits bits.
 * @dst: destination bitmap
 * @src: source bitmap
 * @n: number of bit positions to shift by
 * @nbits: number of bits in the bitmaps
 *
 * In the single-word case the source is masked with
 * BITMAP_LAST_WORD_MASK before shifting.  Bits at or above @nbits are
 * not part of the bitmap; without the mask any stale bits stored there
 * would be shifted down into the valid range of the result.  This
 * matches bitmap_shift_left(), which masks its single-word result.
 * Larger or non-constant sizes are delegated to __bitmap_shift_right().
 */
static inline void bitmap_shift_right(unsigned long *dst,
			const unsigned long *src, int n, int nbits)
{
	if (small_const_nbits(nbits))
		*dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> n;
	else
		__bitmap_shift_right(dst, src, n, nbits);
}
/*
 * bitmap_shift_left - *dst = *src << n, for a bitmap of @nbits bits.
 *
 * In the single-word case the shifted word is masked with
 * BITMAP_LAST_WORD_MASK so that bits pushed to or above @nbits are
 * dropped.  Larger or non-constant sizes are delegated to
 * __bitmap_shift_left().
 */
static inline void bitmap_shift_left(unsigned long *dst,
			const unsigned long *src, int n, int nbits)
{
	if (!small_const_nbits(nbits)) {
		__bitmap_shift_left(dst, src, n, nbits);
		return;
	}

	*dst = (*src << n) & BITMAP_LAST_WORD_MASK(nbits);
}
/*
 * bitmap_parse - parse a bitmap from the buffer @buf of length @buflen
 * into @maskp, which holds @nmaskbits bits.
 *
 * Thin wrapper around __bitmap_parse() that passes 0 for its is_user
 * argument; the return value of __bitmap_parse() is passed through.
 */
static inline int bitmap_parse(const char *buf, unsigned int buflen,
unsigned long *maskp, int nmaskbits)
{
return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits);
}
#endif /* __LINUX_BITMAP_H */

View File

@@ -27,6 +27,31 @@ unsigned long find_first_bit(const unsigned long *addr,
unsigned long find_first_zero_bit(const unsigned long *addr,
unsigned long size);
/*
 * test_bit - report whether bit @nr is set in the bit array at @addr.
 *
 * The array is read as a sequence of 32-bit words: @nr >> 5 selects the
 * word and @nr & 31 the bit inside it.  Returns 1 if the bit is set,
 * 0 otherwise.
 */
static inline int test_bit(int nr, const void *addr)
{
	const uint32_t *words = (const uint32_t *)addr;
	const uint32_t word = words[nr >> 5];
	const unsigned long mask = 1UL << (nr & 31);

	return (mask & word) != 0;
}
extern unsigned int __sw_hweight32(unsigned int w);
extern unsigned int __sw_hweight16(unsigned int w);
extern unsigned int __sw_hweight8(unsigned int w);
extern unsigned long __sw_hweight64(uint64_t w);
/*
 * hweight_long - number of set bits in an unsigned long.
 *
 * Dispatches on the size of unsigned long: the 32-bit software
 * population count when it is 4 bytes wide, the 64-bit one otherwise.
 */
static inline unsigned long hweight_long(unsigned long w)
{
	if (sizeof(w) == 4)
		return __sw_hweight32(w);

	return __sw_hweight64(w);
}
/* Unsigned long with only bit number nr set */
#define BIT(nr) (1UL << (nr))
/* Index of the unsigned long word that contains bit number nr */
#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
#define BITS_PER_BYTE 8
/*
 * Alignment helpers.  __ALIGN_KERNEL_MASK() adds mask to x and then
 * clears the mask bits.  ALIGN(x, a) uses mask = a - 1 (cast to the type
 * of x), which rounds x up to the next multiple of a when a is a power
 * of two.
 */
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)
#define ALIGN(x, a) __ALIGN_KERNEL((x), (a))
#endif /*__ASSEMBLY__*/
#include <arch-bitops.h>

View File

@@ -49,12 +49,21 @@ struct ihk_mc_cpu_info {
int ncpus;
int *hw_ids;
int *nodes;
int *linux_cpu_ids;
int *ikc_cpus;
};
struct ihk_mc_cpu_info *ihk_mc_get_cpu_info(void);
void ihk_mc_boot_cpu(int cpuid, unsigned long pc);
int ihk_mc_get_processor_id(void);
int ihk_mc_get_hardware_processor_id(void);
int ihk_mc_get_numa_id(void);
int ihk_mc_get_nr_cores();
int ihk_mc_get_nr_linux_cores();
int ihk_mc_get_core(int id, unsigned long *linux_core_id, unsigned long *apic_id,
int *numa_id);
int ihk_mc_get_ikc_cpu(int id);
int ihk_mc_get_apicid(int linux_core_id);
void ihk_mc_delay_us(int us);
void ihk_mc_set_syscall_handler(long (*handler)(int, ihk_mc_user_context_t *));
@@ -95,10 +104,16 @@ enum ihk_asr_type {
IHK_ASR_X86_GS,
};
/* Local IRQ vectors */
#define LOCAL_TIMER_VECTOR 0xef
#define LOCAL_PERF_VECTOR 0xf0
#define IHK_TLB_FLUSH_IRQ_VECTOR_START 68
#define IHK_TLB_FLUSH_IRQ_VECTOR_SIZE 64
#define IHK_TLB_FLUSH_IRQ_VECTOR_END (IHK_TLB_FLUSH_IRQ_VECTOR_START + IHK_TLB_FLUSH_IRQ_VECTOR_SIZE)
#define LOCAL_SMP_FUNC_CALL_VECTOR 0xf1
int ihk_mc_arch_set_special_register(enum ihk_asr_type, unsigned long value);
int ihk_mc_arch_get_special_register(enum ihk_asr_type, unsigned long *value);

View File

@@ -34,18 +34,28 @@ enum ihk_mc_gma_type {
IHK_MC_RESERVED_AREA_END,
};
extern unsigned long bootstrap_mem_end;
enum ihk_mc_ma_type {
IHK_MC_MA_AVAILABLE,
IHK_MC_MA_RESERVED,
IHK_MC_MA_SPECIAL,
};
enum ihk_mc_ap_flag {
IHK_MC_AP_FLAG,
IHK_MC_AP_CRITICAL, /* panic on no memory space */
IHK_MC_AP_NOWAIT, /* error return on no memory space */
IHK_MC_AP_WAIT /* wait on no memory space */
};
typedef unsigned long ihk_mc_ap_flag;
/* Panic on no memory space */
#define IHK_MC_AP_CRITICAL 0x000001
/* Error return on no memory space */
#define IHK_MC_AP_NOWAIT 0x000002
/* Wait on no memory space */
#define IHK_MC_AP_WAIT 0x000004
#define IHK_MC_AP_USER 0x001000
#define IHK_MC_AP_BANDWIDTH 0x010000
#define IHK_MC_AP_LATENCY 0x020000
#define IHK_MC_PG_KERNEL 0
#define IHK_MC_PG_USER 1
enum ihk_mc_pt_prepare_flag {
IHK_MC_PT_FIRST_LEVEL,
@@ -72,14 +82,17 @@ struct ihk_mc_memory_node {
unsigned long ihk_mc_get_memory_address(enum ihk_mc_gma_type, int);
void ihk_mc_reserve_arch_pages(unsigned long start, unsigned long end,
void (*cb)(unsigned long, unsigned long, int));
struct ihk_page_allocator_desc;
void ihk_mc_reserve_arch_pages(struct ihk_page_allocator_desc *pa_allocator,
unsigned long start, unsigned long end,
void (*cb)(struct ihk_page_allocator_desc *,
unsigned long, unsigned long, int));
struct ihk_mc_pa_ops {
void *(*alloc_page)(int, int, enum ihk_mc_ap_flag);
void (*free_page)(void *, int);
void *(*alloc_page)(int, int, ihk_mc_ap_flag, int node, int is_user);
void (*free_page)(void *, int, int is_user);
void *(*alloc)(int, enum ihk_mc_ap_flag);
void *(*alloc)(int, ihk_mc_ap_flag);
void (*free)(void *);
};
@@ -100,14 +113,49 @@ void ihk_mc_map_micpa(unsigned long host_pa, unsigned long* mic_pa);
int ihk_mc_free_micpa(unsigned long mic_pa);
void ihk_mc_clean_micpa(void);
void *ihk_mc_alloc_aligned_pages(int npages, int p2align, enum ihk_mc_ap_flag flag);
void *ihk_mc_alloc_pages(int npages, enum ihk_mc_ap_flag flag);
void ihk_mc_free_pages(void *p, int npages);
void *_ihk_mc_alloc_aligned_pages_node(int npages, int p2align,
ihk_mc_ap_flag flag, int node, int is_user, char *file, int line);
#define ihk_mc_alloc_aligned_pages_node(npages, p2align, flag, node) ({\
void *r = _ihk_mc_alloc_aligned_pages_node(npages, p2align, flag, node, IHK_MC_PG_KERNEL, __FILE__, __LINE__);\
r;\
})
#define ihk_mc_alloc_aligned_pages_node_user(npages, p2align, flag, node) ({\
void *r = _ihk_mc_alloc_aligned_pages_node(npages, p2align, flag, node, IHK_MC_PG_USER, __FILE__, __LINE__);\
r;\
})
#define ihk_mc_alloc_aligned_pages(npages, p2align, flag) ({\
void *r = _ihk_mc_alloc_aligned_pages_node(npages, p2align, flag, -1, IHK_MC_PG_KERNEL, __FILE__, __LINE__);\
r;\
})
#define ihk_mc_alloc_aligned_pages_user(npages, p2align, flag) ({\
void *r = _ihk_mc_alloc_aligned_pages_node(npages, p2align, flag, -1, IHK_MC_PG_USER, __FILE__, __LINE__);\
r;\
})
#define ihk_mc_alloc_pages(npages, flag) ({\
void *r = _ihk_mc_alloc_aligned_pages_node(npages, PAGE_P2ALIGN, flag, -1, IHK_MC_PG_KERNEL, __FILE__, __LINE__);\
r;\
})
#define ihk_mc_alloc_pages_user(npages, flag) ({\
void *r = _ihk_mc_alloc_aligned_pages_node(npages, PAGE_P2ALIGN, flag, -1, IHK_MC_PG_USER, __FILE__, __LINE__);\
r;\
})
void _ihk_mc_free_pages(void *ptr, int npages, int is_user, char *file, int line);
#define ihk_mc_free_pages(p, npages) ({\
_ihk_mc_free_pages(p, npages, IHK_MC_PG_KERNEL, __FILE__, __LINE__);\
})
#define ihk_mc_free_pages_user(p, npages) ({\
_ihk_mc_free_pages(p, npages, IHK_MC_PG_USER, __FILE__, __LINE__);\
})
void *ihk_mc_allocate(int size, int flag);
void ihk_mc_free(void *p);
void *arch_alloc_page(enum ihk_mc_ap_flag flag);
void arch_free_page(void *ptr);
int arch_get_smaller_page_size(void *args, size_t origsize, size_t *sizep, int *p2alignp);
typedef void *page_table_t;
@@ -143,22 +191,47 @@ int visit_pte_range(page_table_t pt, void *start, void *end, int pgshift,
int move_pte_range(page_table_t pt, struct process_vm *vm,
void *src, void *dest, size_t size);
struct page_table *ihk_mc_pt_create(enum ihk_mc_ap_flag ap_flag);
struct page_table *ihk_mc_pt_create(ihk_mc_ap_flag ap_flag);
/* XXX: proper use of struct page_table and page_table_t is unknown */
void ihk_mc_pt_destroy(struct page_table *pt);
void ihk_mc_load_page_table(struct page_table *pt);
int ihk_mc_pt_virt_to_phys_size(struct page_table *pt,
const void *virt,
unsigned long *phys,
unsigned long *size);
int ihk_mc_pt_virt_to_phys(struct page_table *pt,
const void *virt, unsigned long *phys);
uint64_t ihk_mc_pt_virt_to_pagemap(struct page_table *pt, unsigned long virt);
int ihk_mc_get_nr_numa_nodes(void);
struct smp_coreset;
int ihk_mc_get_numa_node(int id, int *linux_numa_id, int *type);
int ihk_mc_get_numa_distance(int i, int j);
int ihk_mc_get_nr_memory_chunks(void);
int ihk_mc_get_memory_chunk(int id,
unsigned long *start,
unsigned long *end,
int *numa_id);
void remote_flush_tlb_cpumask(struct process_vm *vm,
unsigned long addr, int cpu_id);
void remote_flush_tlb_array_cpumask(struct process_vm *vm,
unsigned long *addr,
int nr_addr,
int cpu_id);
int ihk_set_kmsg(unsigned long addr, unsigned long size);
char *ihk_get_kargs();
int ihk_set_monitor(unsigned long addr, unsigned long size);
int ihk_set_nmi_mode_addr(unsigned long addr);
extern void (*__tlb_flush_handler)(int vector);
struct tlb_flush_entry {
struct process_vm *vm;
unsigned long addr;
unsigned long *addr;
int nr_addr;
ihk_atomic_t pending;
ihk_spinlock_t lock;
} __attribute__((aligned(64)));

View File

@@ -5,23 +5,69 @@
* Declare functions acquire physical pages and assign virtual addresses
* to them.
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
* \author Balazs Gerofi <bgerofi@riken.jp> \par
*/
/*
* HISTORY
* 2016/12 - bgerofi - NUMA support
* 2017/06 - bgerofi - rewrite physical memory mngt for red-black trees
*/
#ifndef __HEADER_GENERIC_IHK_PAGE_ALLOC
#define __HEADER_GENERIC_IHK_PAGE_ALLOC
#include <list.h>
#include <rbtree.h>
/* XXX: Physical memory management shouldn't be part of IHK */
struct node_distance {
int id;
int distance;
};
#define IHK_RBTREE_ALLOCATOR
#ifdef IHK_RBTREE_ALLOCATOR
struct free_chunk {
unsigned long addr, size;
struct rb_node node;
};
#endif
struct ihk_mc_numa_node {
int id;
int linux_numa_id;
int type;
struct list_head allocators;
struct node_distance *nodes_by_distance;
#ifdef IHK_RBTREE_ALLOCATOR
struct rb_root free_chunks;
mcs_lock_node_t lock;
unsigned long nr_pages;
unsigned long nr_free_pages;
unsigned long min_addr;
unsigned long max_addr;
#endif
};
#ifdef IHK_RBTREE_ALLOCATOR
unsigned long ihk_numa_alloc_pages(struct ihk_mc_numa_node *node,
int npages, int p2align);
void ihk_numa_free_pages(struct ihk_mc_numa_node *node,
unsigned long addr, int npages);
int ihk_numa_add_free_pages(struct ihk_mc_numa_node *node,
unsigned long addr, unsigned long size);
#endif
struct ihk_page_allocator_desc {
unsigned long start;
unsigned long start, end;
unsigned int last;
unsigned int count;
unsigned int flag;
unsigned int shift;
ihk_spinlock_t lock;
unsigned int pad;
mcs_lock_node_t lock;
struct list_head list;
unsigned long map[0];
};

102
lib/include/ihk/rusage.h Normal file
View File

@@ -0,0 +1,102 @@
#ifndef __IHK_RUSAGE_H
#define __IHK_RUSAGE_H
/*
 * Monitoring record for one CPU; struct ihk_os_monitor ends with an
 * array of these.
 * NOTE(review): presumably shared with the host side, so field order and
 * sizes should be treated as fixed -- confirm before changing.
 */
struct ihk_os_cpu_monitor {
/* Current state; takes one of the IHK_OS_MONITOR_* values below */
int status;
#define IHK_OS_MONITOR_NOT_BOOT 0
#define IHK_OS_MONITOR_IDLE 1
#define IHK_OS_MONITOR_USER 2
#define IHK_OS_MONITOR_KERNEL 3
#define IHK_OS_MONITOR_KERNEL_HEAVY 4
#define IHK_OS_MONITOR_KERNEL_OFFLOAD 5
#define IHK_OS_MONITOR_KERNEL_FREEZING 8
#define IHK_OS_MONITOR_KERNEL_FROZEN 9
#define IHK_OS_MONITOR_KERNEL_THAW 10
/* panic() stores this value in status when the per-CPU data exists */
#define IHK_OS_MONITOR_PANIC 99
/* presumably a saved copy of status -- TODO confirm */
int status_bak;
/* presumably a progress counter and its previously observed value --
 * TODO confirm */
unsigned long counter;
unsigned long ocounter;
/* presumably TSC cycles spent in user and in system mode -- TODO confirm */
unsigned long user_tsc;
unsigned long system_tsc;
};
/*
 * OS-wide monitoring area: resource-usage counters, followed by a reserved
 * region, a 1024-entry NUMA statistics array and a trailing variable-length
 * array of per-CPU records.
 *
 * NOTE(review): units of the counters (bytes vs. pages) are not visible
 * here -- confirm before relying on them.
 */
struct ihk_os_monitor {
	/* Thread counters */
	unsigned long rusage_max_num_threads;
	unsigned long rusage_num_threads;

	/* Memory counters; note that rusage_rss_current is signed */
	unsigned long rusage_rss_max;
	long rusage_rss_current;
	unsigned long rusage_kmem_usage;
	unsigned long rusage_kmem_max_usage;
	unsigned long rusage_hugetlb_usage;
	unsigned long rusage_hugetlb_max_usage;
	unsigned long rusage_total_memory;
	unsigned long rusage_total_memory_usage;
	unsigned long rusage_total_memory_max_usage;

	unsigned long num_numa_nodes;
	unsigned long num_processors;
	unsigned long ns_per_tsc;

	unsigned long reserve[128];
	unsigned long rusage_numa_stat[1024];

	/* Trailing variable-length storage; must remain the last member. */
	struct ihk_os_cpu_monitor cpu[0];
};
/*
 * Selectors for the individual resource-usage items.
 * Values are assigned implicitly, counting up from 0 in declaration order,
 * so the order of the enumerators must not change.
 */
enum RUSAGE_MEMBER {
	RUSAGE_RSS,		/* 0 */
	RUSAGE_CACHE,
	RUSAGE_RSS_HUGE,
	RUSAGE_MAPPED_FILE,
	RUSAGE_MAX_USAGE,
	RUSAGE_KMEM_USAGE,
	RUSAGE_KMAX_USAGE,
	RUSAGE_NUM_NUMA_NODES,
	RUSAGE_NUMA_STAT,	/* 8 */
	RUSAGE_HUGETLB,
	RUSAGE_HUGETLB_MAX,
	RUSAGE_STAT_SYSTEM,
	RUSAGE_STAT_USER,
	RUSAGE_USAGE,
	RUSAGE_USAGE_PER_CPU,
	RUSAGE_NUM_THREADS,
	RUSAGE_MAX_NUM_THREADS	/* 16 */
};
/*
 * Resource-usage record keyed by pid, chainable into a singly linked list
 * through the next pointer.
 * NOTE(review): units of the counters are not visible here -- confirm.
 */
struct r_data {
	unsigned long pid;

	unsigned long rss;
	unsigned long cache;
	unsigned long rss_huge;
	unsigned long mapped_file;
	unsigned long max_usage;
	unsigned long kmem_usage;
	unsigned long kmax_usage;
	unsigned long hugetlb;
	unsigned long hugetlb_max;
	unsigned long stat_system;
	unsigned long stat_user;
	unsigned long usage;

	struct r_data *next;	/* following record in the chain */
};
/*
 * OS status codes. Values are implicit and count up from 0, so the
 * declaration order must be preserved.
 */
enum ihk_os_status {
	IHK_STATUS_INACTIVE,	/* 0 */
	IHK_STATUS_BOOTING,	/* 1 */
	IHK_STATUS_RUNNING,	/* 2 */
	IHK_STATUS_SHUTDOWN,	/* 3 */
	IHK_STATUS_PANIC,	/* 4 */
	IHK_STATUS_HUNGUP,	/* 5 */
	IHK_STATUS_FREEZING,	/* 6 */
	IHK_STATUS_FROZEN,	/* 7 */
};
/*
 * Two-state marker: ENTER_KERNEL is 0 and EXIT_KERNEL is 1.
 * NOTE(review): presumably flags entry to / exit from kernel processing of
 * a delegated system call, going by the names -- confirm.
 */
enum sys_delegate_state_enum {
	ENTER_KERNEL,	/* 0 */
	EXIT_KERNEL,	/* 1 */
};
/* Pointer to the monitoring area; defined elsewhere. */
extern struct ihk_os_monitor *monitor;

/*
 * Accessors for the OS status.
 * NOTE(review): st is presumably one of enum ihk_os_status -- confirm.
 *
 * The getter is declared with an explicit (void): in C an empty parameter
 * list leaves the parameters unspecified, so calls with stray arguments
 * would not be diagnosed.
 */
extern void ihk_mc_set_os_status(unsigned long st);
extern unsigned long ihk_mc_get_os_status(void);
#endif

View File

@@ -17,4 +17,8 @@
/*
 * Smallest value of a 32-bit int.
 * Spelled (-0x7fffffff - 1) because the constant 0x80000000 does not fit
 * in int and therefore has type unsigned int; negating it yields a large
 * positive unsigned value rather than a negative int. The parentheses keep
 * the expansion intact inside larger expressions.
 */
#define INT_MIN (-0x7fffffff - 1)
#define IOV_MAX 1024
#ifndef PATH_MAX
#define PATH_MAX 4096
#endif
#endif

Some files were not shown because too many files have changed in this diff Show More