Compare commits
265 Commits
fj_test_20
...
1.7.0-0.2
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
232bc9c44b | ||
|
|
f34373d1c0 | ||
|
|
4698ae166c | ||
|
|
db9ca358f9 | ||
|
|
16a6a1d08b | ||
|
|
2e2e973d78 | ||
|
|
c3c0b7197f | ||
|
|
d086100b35 | ||
|
|
8f74888f87 | ||
|
|
8e42c2a254 | ||
|
|
caf0f5ef63 | ||
|
|
3d030391e8 | ||
|
|
0aeab6b840 | ||
|
|
367bbda713 | ||
|
|
0082447043 | ||
|
|
4f50c90f6e | ||
|
|
79950e045e | ||
|
|
6cf7cebb2d | ||
|
|
c9f05f238d | ||
|
|
f1caaa9b74 | ||
|
|
97cd379ee2 | ||
|
|
8ee1d61d0f | ||
|
|
04d17dd3e9 | ||
|
|
33eef71133 | ||
|
|
c10b4a1c16 | ||
|
|
8cf70900e7 | ||
|
|
b2618a98f5 | ||
|
|
01d06cb218 | ||
|
|
c78803ac08 | ||
|
|
3300e65efc | ||
|
|
d82ac31bc6 | ||
|
|
4946fbdd82 | ||
|
|
33cba1ad48 | ||
|
|
7c69cfaf67 | ||
|
|
b3cbdeec84 | ||
|
|
1d1ec39a27 | ||
|
|
0a4e6b49b4 | ||
|
|
bb7e140655 | ||
|
|
32b32f0c4a | ||
|
|
bf7fd81c1b | ||
|
|
92d191de9e | ||
|
|
baf68f7e71 | ||
|
|
26bebb2749 | ||
|
|
9e2196c9ce | ||
|
|
93581cb142 | ||
|
|
67f5a1d4e0 | ||
|
|
edf7b36669 | ||
|
|
1a204b6674 | ||
|
|
305511b48f | ||
|
|
606db376fd | ||
|
|
5719b4c64a | ||
|
|
343121c3d0 | ||
|
|
86c45484e3 | ||
|
|
767792808a | ||
|
|
117f070fd6 | ||
|
|
a27909be88 | ||
|
|
cec6f24559 | ||
|
|
b3b8283f87 | ||
|
|
d62f80a7c0 | ||
|
|
6d584feaef | ||
|
|
e2e015e120 | ||
|
|
5fb3abe87b | ||
|
|
37fd9e0cd2 | ||
|
|
7e748b4ecb | ||
|
|
cafb46efc7 | ||
|
|
41ea9d16c4 | ||
|
|
4bbdee395e | ||
|
|
597baf8445 | ||
|
|
55faba77a5 | ||
|
|
6bef773741 | ||
|
|
7882110e9f | ||
|
|
d1df17ffb7 | ||
|
|
72af689e69 | ||
|
|
153d0609de | ||
|
|
83bbb87a0f | ||
|
|
f00d03445c | ||
|
|
911b07f507 | ||
|
|
5b26fe2956 | ||
|
|
1db00ebc04 | ||
|
|
d5de68e97b | ||
|
|
1526237bc6 | ||
|
|
b8d96a74ce | ||
|
|
3c256e1a6c | ||
|
|
7fc4272b89 | ||
|
|
d052acab1d | ||
|
|
91ea69cf8f | ||
|
|
0c63a2a3cd | ||
|
|
a8696d811d | ||
|
|
569dc33a9c | ||
|
|
4b252a990f | ||
|
|
adb6cce3ce | ||
|
|
ed21b6849d | ||
|
|
37605740a4 | ||
|
|
e069694c12 | ||
|
|
dca1cb2625 | ||
|
|
caac060684 | ||
|
|
d330721421 | ||
|
|
157eeca41a | ||
|
|
8ba725b225 | ||
|
|
a563d780c1 | ||
|
|
621533bbd3 | ||
|
|
37ea770f8c | ||
|
|
edd3ea0103 | ||
|
|
41d37bcd30 | ||
|
|
309145587f | ||
|
|
bc06d68d84 | ||
|
|
18412616e1 | ||
|
|
c371fbf13b | ||
|
|
1492f16d67 | ||
|
|
fd38ab6fd0 | ||
|
|
f115bae8a7 | ||
|
|
ba80dd8650 | ||
|
|
06960a41d9 | ||
|
|
86a2aabb24 | ||
|
|
b4101d9c36 | ||
|
|
ec31d72483 | ||
|
|
83ade5cdcd | ||
|
|
dec133c1dd | ||
|
|
04a528ab27 | ||
|
|
8e4073c2ca | ||
|
|
ff982b8594 | ||
|
|
299d47abf5 | ||
|
|
f2460695c4 | ||
|
|
6ce5c754f3 | ||
|
|
e932f2e70c | ||
|
|
bb08742467 | ||
|
|
3e9fdfc0f1 | ||
|
|
58f4593478 | ||
|
|
de0e07f29e | ||
|
|
a4b83dc6d4 | ||
|
|
beac6c3e80 | ||
|
|
5d6715078f | ||
|
|
0615a0b00b | ||
|
|
51cd7cbb6c | ||
|
|
0c1cae45fe | ||
|
|
11ef2f8092 | ||
|
|
12aef0b578 | ||
|
|
9b3450ee7e | ||
|
|
0d3ef65092 | ||
|
|
258156b57e | ||
|
|
8efced7bf7 | ||
|
|
2dd8687974 | ||
|
|
f0bc1a6b07 | ||
|
|
c52370b959 | ||
|
|
9c78d4d249 | ||
|
|
b6285c9aa9 | ||
|
|
b945367c90 | ||
|
|
0f434288e1 | ||
|
|
b5cd813229 | ||
|
|
7268942c35 | ||
|
|
f8cad24a9a | ||
|
|
2b6b3f31e5 | ||
|
|
ca19ee434a | ||
|
|
bb2589bac4 | ||
|
|
e1c6e17400 | ||
|
|
207eba93ea | ||
|
|
06af2d62c6 | ||
|
|
3e267e24cb | ||
|
|
e58e1c6e33 | ||
|
|
fb924ebb9d | ||
|
|
ac61577414 | ||
|
|
4cee9b1a27 | ||
|
|
b55e164669 | ||
|
|
aa66fe2cb1 | ||
|
|
3b74b0a093 | ||
|
|
0267a0c8ea | ||
|
|
b3b7801d51 | ||
|
|
10f1fe76db | ||
|
|
089b443aaf | ||
|
|
e9955a4bba | ||
|
|
dc52c8a11a | ||
|
|
bc4629dfb0 | ||
|
|
99fba2df1c | ||
|
|
239c95449b | ||
|
|
9dfc139eae | ||
|
|
bc81d362b4 | ||
|
|
90b6aec53d | ||
|
|
0887e0de6d | ||
|
|
2c5c47344d | ||
|
|
b9f223ceca | ||
|
|
6297181dcd | ||
|
|
80f964e44f | ||
|
|
cc07d6e017 | ||
|
|
07c517828d | ||
|
|
75e42badf4 | ||
|
|
bdccbf7356 | ||
|
|
ad3ee26d36 | ||
|
|
16f8ccb35b | ||
|
|
3fda54ece8 | ||
|
|
4d252c2bb2 | ||
|
|
0cf89c5682 | ||
|
|
0d902872a1 | ||
|
|
9b6a88eeeb | ||
|
|
96b4729cd5 | ||
|
|
3372bbfd23 | ||
|
|
f17c30da07 | ||
|
|
9a0eb915fb | ||
|
|
a5ded1fc06 | ||
|
|
de042b2cb2 | ||
|
|
2cee82673b | ||
|
|
dfb3bef96d | ||
|
|
2dc51530f3 | ||
|
|
13758417c5 | ||
|
|
c32edff2bb | ||
|
|
8356ef6c96 | ||
|
|
63d500515a | ||
|
|
791e8c2114 | ||
|
|
0bb612caea | ||
|
|
5e992bc195 | ||
|
|
08f817a654 | ||
|
|
b87ac8b8c0 | ||
|
|
a48a2cd3e8 | ||
|
|
7c238c27c9 | ||
|
|
de77d2b061 | ||
|
|
52f89cf8fa | ||
|
|
c96dfb0c68 | ||
|
|
21c9e57646 | ||
|
|
312b6c171b | ||
|
|
2ce695b47b | ||
|
|
e5c1fdf129 | ||
|
|
9e3dd53c58 | ||
|
|
fe53c6e0a5 | ||
|
|
e988bfaf50 | ||
|
|
f6f48b1210 | ||
|
|
70b42fde5d | ||
|
|
ccb36a5849 | ||
|
|
ea7f517e3d | ||
|
|
ac18a24a27 | ||
|
|
8880710fad | ||
|
|
03a85825ed | ||
|
|
940eeca6f5 | ||
|
|
19b02cf4ed | ||
|
|
76a0cc71fc | ||
|
|
ab39798181 | ||
|
|
0cc3496747 | ||
|
|
10cca81401 | ||
|
|
0c79de67b4 | ||
|
|
3fbad79afb | ||
|
|
1b76aaa7e1 | ||
|
|
aa3c5e91db | ||
|
|
20d5900c35 | ||
|
|
414cffd95b | ||
|
|
9ec0aeeab5 | ||
|
|
06e96005a6 | ||
|
|
4606714c07 | ||
|
|
a5d5baf8a8 | ||
|
|
8074445d59 | ||
|
|
6a456f11aa | ||
|
|
81e665cb48 | ||
|
|
e0b9c5deec | ||
|
|
62772c8a24 | ||
|
|
63d15f7dfc | ||
|
|
fb3f1c58a8 | ||
|
|
69846345de | ||
|
|
b8155cc618 | ||
|
|
f07e20a381 | ||
|
|
764948b51f | ||
|
|
7da5fede8b | ||
|
|
6810506c3d | ||
|
|
c82c2c1231 | ||
|
|
5bc54a3bbe | ||
|
|
07aa96ef95 | ||
|
|
dac99f708c | ||
|
|
f3c9fbf4ea | ||
|
|
54122360e8 |
9
.gitignore
vendored
9
.gitignore
vendored
@@ -13,6 +13,10 @@ old_timestamp
|
|||||||
CMakeFiles
|
CMakeFiles
|
||||||
CMakeCache.txt
|
CMakeCache.txt
|
||||||
Makefile
|
Makefile
|
||||||
|
!test/*/*/Makefile
|
||||||
|
!test/signalonfork+wait/Makefile
|
||||||
|
!test/perf_overflow/Makefile
|
||||||
|
!test/*/*/*.cmd
|
||||||
Kbuild
|
Kbuild
|
||||||
cmake_install.cmake
|
cmake_install.cmake
|
||||||
config.h
|
config.h
|
||||||
@@ -33,3 +37,8 @@ executer/user/libmcexec.a
|
|||||||
executer/user/libldump2mcdump.so
|
executer/user/libldump2mcdump.so
|
||||||
executer/user/eclair
|
executer/user/eclair
|
||||||
tools/mcstat/mcstat
|
tools/mcstat/mcstat
|
||||||
|
/_CPack_Packages
|
||||||
|
/CPackSourceConfig.cmake
|
||||||
|
CPackConfig.cmake
|
||||||
|
/build
|
||||||
|
mckernel-*.tar.gz
|
||||||
|
|||||||
150
CMakeLists.txt
150
CMakeLists.txt
@@ -7,59 +7,116 @@ endif (NOT CMAKE_BUILD_TYPE)
|
|||||||
enable_language(C ASM)
|
enable_language(C ASM)
|
||||||
|
|
||||||
project(mckernel C ASM)
|
project(mckernel C ASM)
|
||||||
set(MCKERNEL_VERSION "1.6.0")
|
set(MCKERNEL_VERSION "1.7.0")
|
||||||
|
|
||||||
|
# See "Fedora Packaging Guidlines -- Versioning"
|
||||||
|
set(MCKERNEL_RELEASE "0.2")
|
||||||
|
|
||||||
set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/modules)
|
set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/modules)
|
||||||
# for rpmbuild
|
# for rpmbuild
|
||||||
if(DEFINED SYSCONF_INSTALL_DIR)
|
if(DEFINED SYSCONF_INSTALL_DIR)
|
||||||
set(CMAKE_INSTALL_SYSCONFDIR "${SYSCONF_INSTALL_DIR}")
|
set(CMAKE_INSTALL_SYSCONFDIR "${SYSCONF_INSTALL_DIR}")
|
||||||
endif()
|
endif()
|
||||||
include(GNUInstallDirs)
|
|
||||||
include(CMakeParseArguments)
|
|
||||||
include(Kbuild)
|
|
||||||
include(Ksym)
|
|
||||||
include(CheckCCompilerFlag)
|
|
||||||
|
|
||||||
set(CFLAGS_WARNINGS "-Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Wno-unused-function")
|
|
||||||
CHECK_C_COMPILER_FLAG(-Wno-implicit-fallthrough IMPLICIT_FALLTHROUGH)
|
|
||||||
if(IMPLICIT_FALLTHROUGH)
|
|
||||||
set(CFLAGS_WARNINGS "${CFLAGS_WARNINGS} -Wno-implicit-fallthrough")
|
|
||||||
endif(IMPLICIT_FALLTHROUGH)
|
|
||||||
|
|
||||||
# C flags need to be set before enabling language?
|
|
||||||
set(CMAKE_C_FLAGS_DEBUG "-g ${CFLAGS_WARNINGS}" CACHE STRING "Debug compiler flags")
|
|
||||||
set(CMAKE_C_FLAGS_RELEASE "${CFLAGS_WARNINGS}" CACHE STRING "Release compiler flags")
|
|
||||||
|
|
||||||
# build options
|
|
||||||
option(ENABLE_WERROR "Enable -Werror" OFF)
|
|
||||||
if (ENABLE_WERROR)
|
|
||||||
add_compile_options("-Werror")
|
|
||||||
endif(ENABLE_WERROR)
|
|
||||||
|
|
||||||
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
|
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
|
||||||
set(BUILD_TARGET "smp-x86" CACHE STRING "Build target: smp-x86 | smp-arm64")
|
set(BUILD_TARGET "smp-x86" CACHE STRING "Build target: smp-x86 | smp-arm64")
|
||||||
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
|
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
|
||||||
set(BUILD_TARGET "smp-arm64" CACHE STRING "Build target: smp-x86 | smp-arm64")
|
set(BUILD_TARGET "smp-arm64" CACHE STRING "Build target: smp-x86 | smp-arm64")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (BUILD_TARGET STREQUAL "smp-x86")
|
if (BUILD_TARGET STREQUAL "smp-x86")
|
||||||
set(ARCH "x86_64")
|
set(ARCH "x86_64")
|
||||||
elseif (BUILD_TARGET STREQUAL "smp-arm64")
|
elseif (BUILD_TARGET STREQUAL "smp-arm64")
|
||||||
set(ARCH "arm64")
|
set(ARCH "arm64")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
include(GNUInstallDirs)
|
||||||
|
include(CMakeParseArguments)
|
||||||
|
include(Kbuild)
|
||||||
|
include(CheckCCompilerFlag)
|
||||||
|
|
||||||
|
CHECK_C_COMPILER_FLAG(-Wno-implicit-fallthrough IMPLICIT_FALLTHROUGH)
|
||||||
|
if(IMPLICIT_FALLTHROUGH)
|
||||||
|
set(EXTRA_WARNINGS "-Wno-implicit-fallthrough")
|
||||||
|
endif(IMPLICIT_FALLTHROUGH)
|
||||||
|
|
||||||
|
# build options
|
||||||
|
set(CFLAGS_WARNING "-Wall" "-Wextra" "-Wno-unused-parameter" "-Wno-sign-compare" "-Wno-unused-function" ${EXTRA_WARNINGS} CACHE STRING "Warning flags")
|
||||||
|
add_compile_options(${CFLAGS_WARNING})
|
||||||
|
|
||||||
|
option(ENABLE_WERROR "Enable -Werror" OFF)
|
||||||
|
if (ENABLE_WERROR)
|
||||||
|
add_compile_options("-Werror")
|
||||||
|
endif(ENABLE_WERROR)
|
||||||
|
|
||||||
|
option(ENABLE_LINUX_WORK_IRQ_FOR_IKC "Use Linux work IRQ for IKC IPI" ON)
|
||||||
|
if (ENABLE_LINUX_WORK_IRQ_FOR_IKC)
|
||||||
|
set(KBUILD_C_FLAGS "${KBUILD_C_FLAGS} -DIHK_IKC_USE_LINUX_WORK_IRQ")
|
||||||
|
add_definitions(-DIHK_IKC_USE_LINUX_WORK_IRQ)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (BUILD_TARGET STREQUAL "smp-arm64")
|
||||||
foreach(i RANGE 1 120)
|
foreach(i RANGE 1 120)
|
||||||
add_definitions(-DPOSTK_DEBUG_ARCH_DEP_${i} -DPOSTK_DEBUG_TEMP_FIX_${i})
|
add_definitions(-DPOSTK_DEBUG_ARCH_DEP_${i} -DPOSTK_DEBUG_TEMP_FIX_${i})
|
||||||
set(KBUILD_C_FLAGS "${KBUILD_C_FLAGS} -DPOSTK_DEBUG_ARCH_DEP_${i} -DPOSTK_DEBUG_TEMP_FIX_${i}")
|
set(KBUILD_C_FLAGS "${KBUILD_C_FLAGS} -DPOSTK_DEBUG_ARCH_DEP_${i} -DPOSTK_DEBUG_TEMP_FIX_${i}")
|
||||||
endforeach()
|
endforeach()
|
||||||
add_definitions(-DCONFIG_ARM64_64K_PAGES -DCONFIG_ARM64_VA_BITS=48)
|
|
||||||
|
execute_process(COMMAND awk -F= "$1 == \"CONFIG_ARM64_64K_PAGES\" { print $2; exit; }" "${KERNEL_DIR}/.config"
|
||||||
|
OUTPUT_VARIABLE CONFIG_ARM64_64K_PAGES OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||||
|
execute_process(COMMAND awk -F= "$1 == \"CONFIG_ARM64_VA_BITS\" { print $2; exit; }" "${KERNEL_DIR}/.config"
|
||||||
|
OUTPUT_VARIABLE CONFIG_ARM64_VA_BITS OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||||
|
|
||||||
|
message("Host kernel CONFIG_ARM64_64K_PAGES=${CONFIG_ARM64_64K_PAGES}")
|
||||||
|
message("Host kernel CONFIG_ARM64_VA_BITS=${CONFIG_ARM64_VA_BITS}")
|
||||||
|
|
||||||
|
if(CONFIG_ARM64_64K_PAGES STREQUAL "y")
|
||||||
|
if(CONFIG_ARM64_VA_BITS STREQUAL 42)
|
||||||
|
add_definitions(-DCONFIG_ARM64_PGTABLE_LEVELS=2 -DCONFIG_ARM64_VA_BITS=42 -DCONFIG_ARM64_64K_PAGES)
|
||||||
|
set(LINKER_SCRIPT "smp-arm64_type3.lds")
|
||||||
|
elseif(CONFIG_ARM64_VA_BITS STREQUAL 48)
|
||||||
|
add_definitions(-DCONFIG_ARM64_PGTABLE_LEVELS=3 -DCONFIG_ARM64_VA_BITS=48 -DCONFIG_ARM64_64K_PAGES)
|
||||||
|
set(LINKER_SCRIPT "smp-arm64_type4.lds")
|
||||||
|
endif()
|
||||||
|
else(CONFIG_ARM64_64K_PAGES STREQUAL "y")
|
||||||
|
if(CONFIG_ARM64_VA_BITS STREQUAL 39)
|
||||||
|
add_definitions(-DCONFIG_ARM64_PGTABLE_LEVELS=3 -DCONFIG_ARM64_VA_BITS=39)
|
||||||
|
set(LINKER_SCRIPT "smp-arm64_type1.lds")
|
||||||
|
elseif(CONFIG_ARM64_VA_BITS STREQUAL 48)
|
||||||
|
add_definitions(-DCONFIG_ARM64_PGTABLE_LEVELS=4 -DCONFIG_ARM64_VA_BITS=48)
|
||||||
|
set(LINKER_SCRIPT "smp-arm64_type2.lds")
|
||||||
|
endif()
|
||||||
|
endif(CONFIG_ARM64_64K_PAGES STREQUAL "y")
|
||||||
endif()
|
endif()
|
||||||
set_property(CACHE BUILD_TARGET PROPERTY STRINGS smp-x86 smp-arm64)
|
set_property(CACHE BUILD_TARGET PROPERTY STRINGS smp-x86 smp-arm64)
|
||||||
|
|
||||||
|
# define MAP_KERNEL_START
|
||||||
|
|
||||||
|
set(tmpdir ${CMAKE_CURRENT_BINARY_DIR}/tmp.resolve_MODULES_END)
|
||||||
|
file(REMOVE_RECURSE ${tmpdir})
|
||||||
|
file(MAKE_DIRECTORY ${tmpdir})
|
||||||
|
file(WRITE ${tmpdir}/driver.c "#include <linux/module.h>\n")
|
||||||
|
file(APPEND ${tmpdir}/driver.c "unsigned long MAP_KERNEL_START = MODULES_END - (1UL << 23);\n")
|
||||||
|
file(WRITE ${tmpdir}/Makefile "obj-m := driver.o\n")
|
||||||
|
file(APPEND ${tmpdir}/Makefile "all:\n")
|
||||||
|
file(APPEND ${tmpdir}/Makefile "\tmake ${KBUILD_MAKE_FLAGS_STR} -C ${KERNEL_DIR} M=${tmpdir} modules\n")
|
||||||
|
|
||||||
|
execute_process(COMMAND make -C ${tmpdir})
|
||||||
|
execute_process(COMMAND bash -c "offset=`readelf -S ${tmpdir}/driver.ko | grep .data | sed 's/.* //g'`; echo $((0x$offset))"
|
||||||
|
OUTPUT_VARIABLE MAP_KERNEL_START_OFFSET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||||
|
execute_process(COMMAND bash -c "dd if=${tmpdir}/driver.ko bs=1 skip=${MAP_KERNEL_START_OFFSET} count=8 2>/dev/null | od -tx8 -Ax | head -1 | sed 's|.* |0x|g'"
|
||||||
|
OUTPUT_VARIABLE MAP_KERNEL_START OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||||
|
|
||||||
|
|
||||||
set(ENABLE_MEMDUMP ON)
|
set(ENABLE_MEMDUMP ON)
|
||||||
option(ENABLE_PERF "Enable perf support" ON)
|
option(ENABLE_PERF "Enable perf support" ON)
|
||||||
option(ENABLE_RUSAGE "Enable rusage support" ON)
|
option(ENABLE_RUSAGE "Enable rusage support" ON)
|
||||||
option(ENABLE_MCOVERLAYFS "Enable overlay filesystem" OFF)
|
|
||||||
option(ENABLE_QLMPI "Enable qlmpi programs" OFF)
|
option(ENABLE_QLMPI "Enable qlmpi programs" OFF)
|
||||||
option(ENABLE_UTI "Enable uti support" OFF)
|
option(ENABLE_UTI "Enable uti support" OFF)
|
||||||
option(ENABLE_UBSAN "Enable undefined behaviour sanitizer on mckernel size" OFF)
|
option(ENABLE_UBSAN "Enable undefined behaviour sanitizer on mckernel size" OFF)
|
||||||
|
option(ENABLE_PER_CPU_ALLOC_CACHE "Enable per-CPU allocator cache (ThunderX2 workaround)" OFF)
|
||||||
|
|
||||||
|
find_package(PkgConfig REQUIRED)
|
||||||
|
set(PKG_CONFIG_USE_CMAKE_PREFIX_PATH ON)
|
||||||
|
|
||||||
find_library(LIBRT rt)
|
find_library(LIBRT rt)
|
||||||
find_library(LIBNUMA numa)
|
find_library(LIBNUMA numa)
|
||||||
@@ -71,7 +128,8 @@ if (ENABLE_QLMPI)
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (ENABLE_UTI)
|
if (ENABLE_UTI)
|
||||||
find_library(LIBSYSCALL_INTERCEPT syscall_intercept)
|
pkg_check_modules(LIBSYSCALL_INTERCEPT REQUIRED libsyscall_intercept)
|
||||||
|
link_directories(${LIBSYSCALL_INTERCEPT_LIBRARY_DIRS})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(-([0-9]+)(.*))?" "\\1;\\2;\\3;\\5;\\6" LINUX_VERSION ${UNAME_R})
|
string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(-([0-9]+)(.*))?" "\\1;\\2;\\3;\\5;\\6" LINUX_VERSION ${UNAME_R})
|
||||||
@@ -81,29 +139,11 @@ list(GET LINUX_VERSION 2 LINUX_VERSION_PATCH)
|
|||||||
list(GET LINUX_VERSION 3 LINUX_VERSION_RELEASE)
|
list(GET LINUX_VERSION 3 LINUX_VERSION_RELEASE)
|
||||||
math(EXPR LINUX_VERSION_CODE "${LINUX_VERSION_MAJOR} * 65536 + ${LINUX_VERSION_MINOR} * 256 + ${LINUX_VERSION_PATCH}")
|
math(EXPR LINUX_VERSION_CODE "${LINUX_VERSION_MAJOR} * 65536 + ${LINUX_VERSION_MINOR} * 256 + ${LINUX_VERSION_PATCH}")
|
||||||
|
|
||||||
ksym(sys_mount PREFIX MCCTRL_)
|
|
||||||
ksym(sys_umount PREFIX MCCTRL_)
|
|
||||||
ksym(sys_unshare PREFIX MCCTRL_)
|
|
||||||
ksym(zap_page_range PREFIX MCCTRL_)
|
|
||||||
ksym(vdso_image_64 PREFIX MCCTRL_)
|
|
||||||
ksym(vdso_start PREFIX MCCTRL_)
|
|
||||||
ksym(vdso_end PREFIX MCCTRL_)
|
|
||||||
ksym(vdso_pages PREFIX MCCTRL_)
|
|
||||||
ksym(__vvar_page PREFIX MCCTRL_)
|
|
||||||
ksym(hpet_address PREFIX MCCTRL_)
|
|
||||||
# POSTK_DEBUG_ARCH_DEP_50, add:find kernel symbol.
|
|
||||||
ksym(vdso_spec PREFIX MCCTRL_)
|
|
||||||
ksym(hv_clock PREFIX MCCTRL_)
|
|
||||||
ksym(sys_readlink PREFIX MCCTRL_)
|
|
||||||
ksym(walk_page_range PREFIX MCCTRL_)
|
|
||||||
|
|
||||||
|
|
||||||
# compat with various install paths
|
# compat with various install paths
|
||||||
set(MCKERNEL_LIBDIR ${CMAKE_INSTALL_FULL_LIBDIR})
|
|
||||||
set(BINDIR ${CMAKE_INSTALL_FULL_BINDIR})
|
set(BINDIR ${CMAKE_INSTALL_FULL_BINDIR})
|
||||||
set(SBINDIR ${CMAKE_INSTALL_FULL_SBINDIR})
|
set(SBINDIR ${CMAKE_INSTALL_FULL_SBINDIR})
|
||||||
set(ETCDIR ${CMAKE_INSTALL_FULL_SYSCONFDIR})
|
set(ETCDIR ${CMAKE_INSTALL_PREFIX}/etc)
|
||||||
set(ROOTFSDIR "${CMAKE_INSTALL_PREFIX}/rootfs")
|
set(ROOTFSDIR "/rootfs")
|
||||||
if (CMAKE_INSTALL_PREFIX STREQUAL "/usr")
|
if (CMAKE_INSTALL_PREFIX STREQUAL "/usr")
|
||||||
set(KMODDIR "/lib/modules/${UNAME_R}/extra/mckernel")
|
set(KMODDIR "/lib/modules/${UNAME_R}/extra/mckernel")
|
||||||
set(MCKERNELDIR "${CMAKE_INSTALL_FULL_DATADIR}/mckernel/${BUILD_TARGET}")
|
set(MCKERNELDIR "${CMAKE_INSTALL_FULL_DATADIR}/mckernel/${BUILD_TARGET}")
|
||||||
@@ -138,23 +178,19 @@ configure_file(config.h.in config.h)
|
|||||||
|
|
||||||
# actual build section - just subdirs
|
# actual build section - just subdirs
|
||||||
add_subdirectory(executer/kernel/mcctrl)
|
add_subdirectory(executer/kernel/mcctrl)
|
||||||
if (ENABLE_MCOVERLAYFS)
|
|
||||||
add_subdirectory(executer/kernel/mcoverlayfs)
|
|
||||||
endif()
|
|
||||||
add_subdirectory(executer/user)
|
add_subdirectory(executer/user)
|
||||||
add_subdirectory(kernel)
|
add_subdirectory(kernel)
|
||||||
add_subdirectory(tools/mcstat)
|
add_subdirectory(tools/mcstat)
|
||||||
|
add_subdirectory(tools/crash)
|
||||||
|
|
||||||
configure_file(arch/x86_64/tools/mcreboot-smp-x86.sh.in mcreboot.sh @ONLY)
|
configure_file(scripts/mcreboot-smp.sh.in mcreboot.sh @ONLY)
|
||||||
configure_file(arch/x86_64/tools/mcstop+release-smp-x86.sh.in mcstop+release.sh @ONLY)
|
configure_file(scripts/mcstop+release-smp.sh.in mcstop+release.sh @ONLY)
|
||||||
configure_file(arch/x86_64/tools/mcreboot.1in mcreboot.1 @ONLY)
|
configure_file(scripts/mcreboot.1in mcreboot.1 @ONLY)
|
||||||
install(PROGRAMS
|
install(PROGRAMS
|
||||||
"${CMAKE_CURRENT_BINARY_DIR}/mcreboot.sh"
|
"${CMAKE_CURRENT_BINARY_DIR}/mcreboot.sh"
|
||||||
"${CMAKE_CURRENT_BINARY_DIR}/mcstop+release.sh"
|
"${CMAKE_CURRENT_BINARY_DIR}/mcstop+release.sh"
|
||||||
DESTINATION "${CMAKE_INSTALL_SBINDIR}")
|
DESTINATION "${CMAKE_INSTALL_SBINDIR}")
|
||||||
install(FILES
|
install(FILES "scripts/irqbalance_mck.in"
|
||||||
"arch/x86_64/tools/irqbalance_mck.service"
|
|
||||||
"arch/x86_64/tools/irqbalance_mck.in"
|
|
||||||
DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}")
|
DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}")
|
||||||
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/mcreboot.1"
|
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/mcreboot.1"
|
||||||
DESTINATION "${CMAKE_INSTALL_MANDIR}/man1")
|
DESTINATION "${CMAKE_INSTALL_MANDIR}/man1")
|
||||||
@@ -162,7 +198,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/mcreboot.1"
|
|||||||
|
|
||||||
configure_file(scripts/mckernel.spec.in scripts/mckernel.spec @ONLY)
|
configure_file(scripts/mckernel.spec.in scripts/mckernel.spec @ONLY)
|
||||||
set(CPACK_SOURCE_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${MCKERNEL_VERSION}")
|
set(CPACK_SOURCE_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${MCKERNEL_VERSION}")
|
||||||
set(CPACK_SOURCE_IGNORE_FILES "/.git$")
|
set(CPACK_SOURCE_IGNORE_FILES "/.git/;/build;/CMakeCache.txt$;/CMakeFiles$;/Makefile$")
|
||||||
set(CPACK_SOURCE_INSTALLED_DIRECTORIES "${CMAKE_SOURCE_DIR};/;${IHK_FULL_SOURCE_DIR};/ihk;${CMAKE_BINARY_DIR}/scripts;/scripts")
|
set(CPACK_SOURCE_INSTALLED_DIRECTORIES "${CMAKE_SOURCE_DIR};/;${IHK_FULL_SOURCE_DIR};/ihk;${CMAKE_BINARY_DIR}/scripts;/scripts")
|
||||||
set(CPACK_SOURCE_GENERATOR "TGZ")
|
set(CPACK_SOURCE_GENERATOR "TGZ")
|
||||||
include(CPack)
|
include(CPack)
|
||||||
@@ -181,12 +217,14 @@ message("KERNEL_DIR: ${KERNEL_DIR}")
|
|||||||
message("SYSTEM_MAP: ${SYSTEM_MAP}")
|
message("SYSTEM_MAP: ${SYSTEM_MAP}")
|
||||||
message("VMLINUX: ${VMLINUX}")
|
message("VMLINUX: ${VMLINUX}")
|
||||||
message("KBUILD_C_FLAGS: ${KBUILD_C_FLAGS}")
|
message("KBUILD_C_FLAGS: ${KBUILD_C_FLAGS}")
|
||||||
|
message("MAP_KERNEL_START: ${MAP_KERNEL_START}")
|
||||||
message("ENABLE_MEMDUMP: ${ENABLE_MEMDUMP}")
|
message("ENABLE_MEMDUMP: ${ENABLE_MEMDUMP}")
|
||||||
message("ENABLE_PERF: ${ENABLE_PERF}")
|
message("ENABLE_PERF: ${ENABLE_PERF}")
|
||||||
message("ENABLE_RUSAGE: ${ENABLE_RUSAGE}")
|
message("ENABLE_RUSAGE: ${ENABLE_RUSAGE}")
|
||||||
message("ENABLE_MCOVERLAYFS: ${ENABLE_MCOVERLAYFS}")
|
|
||||||
message("ENABLE_QLMPI: ${ENABLE_QLMPI}")
|
message("ENABLE_QLMPI: ${ENABLE_QLMPI}")
|
||||||
message("ENABLE_UTI: ${ENABLE_UTI}")
|
message("ENABLE_UTI: ${ENABLE_UTI}")
|
||||||
message("ENABLE_WERROR: ${ENABLE_WERROR}")
|
message("ENABLE_WERROR: ${ENABLE_WERROR}")
|
||||||
message("ENABLE_UBSAN: ${ENABLE_UBSAN}")
|
message("ENABLE_UBSAN: ${ENABLE_UBSAN}")
|
||||||
|
message("ENABLE_LINUX_WORK_IRQ_FOR_IKC: ${ENABLE_LINUX_WORK_IRQ_FOR_IKC}")
|
||||||
|
message("ENABLE_PER_CPU_ALLOC_CACHE: ${ENABLE_PER_CPU_ALLOC_CACHE}")
|
||||||
message("-------------------------------")
|
message("-------------------------------")
|
||||||
|
|||||||
70
KNOWN_BUGS.md
Normal file
70
KNOWN_BUGS.md
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
Linux crash when offlining CPU (el7, hardware-specific)
|
||||||
|
=========================================================
|
||||||
|
|
||||||
|
On some hardware with el7 kernel, linux can crash due to a bug in the
|
||||||
|
irq handling when offlining CPUs (reserve cpu part of mcreboot)
|
||||||
|
|
||||||
|
Example stack trace:
|
||||||
|
```
|
||||||
|
[ 4147.052753] BUG: unable to handle kernel NULL pointer dereference at 0000000000000040
|
||||||
|
[ 4147.060677] IP: [<ffffffff8102ce26>] check_irq_vectors_for_cpu_disable+0x86/0x1c0
|
||||||
|
[ 4147.068226] PGD 1057e44067 PUD 105f1e7067 PMD 0
|
||||||
|
[ 4147.072935] Oops: 0000 [#1] SMP
|
||||||
|
[ 4147.076230] Modules linked in: mcctrl(OE) ihk_smp_x86_64(OE) ihk(OE) xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 tun rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache ip6t_rpfilter ipt_REJECT nf_reject_ipv4 ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink ebtable_nat ebtable_broute bridge stp llc ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp scsi_tgt ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm mlx4_ib ib_core
|
||||||
|
[ 4147.148619] dm_mirror dm_region_hash dm_log dm_mod sb_edac edac_core intel_powerclamp coretemp ext4 mbcache jbd2 intel_rapl iosf_mbi kvm_intel kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul ipmi_ssif glue_helper ablk_helper joydev iTCO_wdt iTCO_vendor_support cryptd ipmi_si ipmi_devintf ipmi_msghandler pcspkr wmi mei_me mei lpc_ich i2c_i801 sg ioatdma shpchp nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c mlx4_en sd_mod crc_t10dif crct10dif_generic mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm isci igb drm mlx4_core libsas ahci libahci scsi_transport_sas libata crct10dif_pclmul ptp crct10dif_common pps_core crc32c_intel dca i2c_algo_bit i2c_core devlink [last unloaded: ihk]
|
||||||
|
[ 4147.215370] CPU: 6 PID: 38 Comm: migration/6 Tainted: G OE ------------ T 3.10.0-693.2.2.el7.x86_64 #1
|
||||||
|
[ 4147.225672] Hardware name: SGI.COM C1104G-RP5/X9DRG-HF, BIOS 3.0 10/25/2013
|
||||||
|
[ 4147.232747] task: ffff880174689fa0 ti: ffff8801746ac000 task.ti: ffff8801746ac000
|
||||||
|
[ 4147.240278] RIP: 0010:[<ffffffff8102ce26>] [<ffffffff8102ce26>] check_irq_vectors_for_cpu_disable+0x86/0x1c0
|
||||||
|
[ 4147.250275] RSP: 0018:ffff8801746afd30 EFLAGS: 00010046
|
||||||
|
[ 4147.255608] RAX: 0000000000000000 RBX: 000000000000004e RCX: 0000000000000000
|
||||||
|
[ 4147.262770] RDX: 0000000000000020 RSI: 000000000000005f RDI: 0000000000000023
|
||||||
|
[ 4147.269936] RBP: ffff8801746afd58 R08: 0000000000000001 R09: ffff88017f800490
|
||||||
|
[ 4147.277103] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000006
|
||||||
|
[ 4147.284269] R13: 0000000000000000 R14: ffff88085ca82500 R15: 000000000000005f
|
||||||
|
[ 4147.291429] FS: 0000000000000000(0000) GS:ffff88085fb80000(0000) knlGS:0000000000000000
|
||||||
|
[ 4147.299556] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
|
||||||
|
[ 4147.305326] CR2: 0000000000000040 CR3: 0000001059704000 CR4: 00000000001407e0
|
||||||
|
[ 4147.312490] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
|
||||||
|
[ 4147.319659] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
|
||||||
|
[ 4147.326827] Stack:
|
||||||
|
[ 4147.328857] ffff8808f43078c8 ffff8808f4307850 0000000000000286 ffff8808f4307701
|
||||||
|
[ 4147.336384] 0000000000000000 ffff8801746afd70 ffffffff81052a82 0000000200000000
|
||||||
|
[ 4147.343915] ffff8801746afd88 ffffffff81693ca3 0000000000000003 ffff8801746afdc0
|
||||||
|
[ 4147.351447] Call Trace:
|
||||||
|
[ 4147.353921] [<ffffffff81052a82>] native_cpu_disable+0x12/0x40
|
||||||
|
[ 4147.359795] [<ffffffff81693ca3>] take_cpu_down+0x13/0x40
|
||||||
|
[ 4147.365236] [<ffffffff81116899>] multi_cpu_stop+0xd9/0x100
|
||||||
|
[ 4147.370850] [<ffffffff811167c0>] ? cpu_stop_should_run+0x50/0x50
|
||||||
|
[ 4147.376983] [<ffffffff81116ab7>] cpu_stopper_thread+0x97/0x150
|
||||||
|
[ 4147.382942] [<ffffffff816a8fad>] ? __schedule+0x39d/0x8b0
|
||||||
|
[ 4147.388461] [<ffffffff810b909f>] smpboot_thread_fn+0x12f/0x180
|
||||||
|
[ 4147.394406] [<ffffffff810b8f70>] ? lg_double_unlock+0x40/0x40
|
||||||
|
[ 4147.400276] [<ffffffff810b098f>] kthread+0xcf/0xe0
|
||||||
|
[ 4147.405182] [<ffffffff810b08c0>] ? insert_kthread_work+0x40/0x40
|
||||||
|
[ 4147.411319] [<ffffffff816b4f58>] ret_from_fork+0x58/0x90
|
||||||
|
[ 4147.418893] [<ffffffff810b08c0>] ? insert_kthread_work+0x40/0x40
|
||||||
|
[ 4147.426524] Code: 81 fb 00 01 00 00 0f 84 8a 00 00 00 89 d8 65 44 8b 3c 85 20 c6 00 00 45 85 ff 78 e1 44 89 ff e8 91 31 10 00 48 63 15 7e 10 af 00 <48> 8b 70 40 48 c7 c7 80 71 cf 81 49 89 c6 48 83 c2 3f 48 c1 fa
|
||||||
|
[ 4147.450352] RIP [<ffffffff8102ce26>] check_irq_vectors_for_cpu_disable+0x86/0x1c0
|
||||||
|
[ 4147.460135] RSP <ffff8801746afd30>
|
||||||
|
[ 4147.465154] CR2: 0000000000000040
|
||||||
|
```
|
||||||
|
|
||||||
|
This bug has been fixed upstream, but redhat will not backport the fixes.
|
||||||
|
You can work around the problem with a kpatch by backporting the three
|
||||||
|
following commits:
|
||||||
|
|
||||||
|
x86: irq: Get correct available vectors for cpu disable
|
||||||
|
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ac2a55395eddccd6e3e39532df9869d61e97b2ee
|
||||||
|
|
||||||
|
x86/irq: Check for valid irq descriptor in check_irq_vectors_for_cpu_disable()
|
||||||
|
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d97eb8966c91f2c9d05f0a22eb89ed5b76d966d1
|
||||||
|
|
||||||
|
x86/irq: Use proper locking in check_irq_vectors_for_cpu_disable()
|
||||||
|
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cbb24dc761d95fe39a7a122bb1b298e9604cae15
|
||||||
|
|
||||||
|
|
||||||
|
Alternatively, since it is related to the irq configuration, it might
|
||||||
|
be possible to mitigate the issue by setting the irq affinities early
|
||||||
|
on and making sure none of the cpus that will be offlined have any irq
|
||||||
|
configured.
|
||||||
534
NEWS.md
Normal file
534
NEWS.md
Normal file
@@ -0,0 +1,534 @@
|
|||||||
|
=============================================
|
||||||
|
What's new in version 1.7.0rc4 (Apr 15, 2020)
|
||||||
|
=============================================
|
||||||
|
|
||||||
|
----------------------
|
||||||
|
McKernel major updates
|
||||||
|
----------------------
|
||||||
|
1. arm64: Contiguous PTE support
|
||||||
|
2. arm64: Scalable Vector Extension (SVE) support
|
||||||
|
3. arm64: PMU overflow interrupt support
|
||||||
|
4. xpmem: Support large page attachment
|
||||||
|
5. arm64 port: Direct access to Mckernel memory from Linux
|
||||||
|
6. arm64 port: utility thread offloading, which spawns thread onto
|
||||||
|
Linux CPU
|
||||||
|
7. eclair: support for live debug
|
||||||
|
8. Crash utility extension
|
||||||
|
9. Replace mcoverlayfs with a soft userspace overlay
|
||||||
|
10. Build system is switched to cmake
|
||||||
|
11. Core dump includes thread information
|
||||||
|
|
||||||
|
------------------------
|
||||||
|
McKernel major bug fixes
|
||||||
|
------------------------
|
||||||
|
1. shmobj: Fix rusage counting for large page
|
||||||
|
2. mcctrl control: task start_time changed to u64 nsec
|
||||||
|
3. mcctrl: add handling for one more level of page tables
|
||||||
|
4. Add kernel argument to turn on/off time sharing
|
||||||
|
5. flatten_string/process env: realign env and clear trailing bits
|
||||||
|
6. madvise: Add MADV_HUGEPAGE support
|
||||||
|
8. mcctrl: remove in-kernel calls to syscalls
|
||||||
|
9. arch_cpu_read_write_register: error return fix.
|
||||||
|
10. set_cputime(): interrupt enable/disable fix.
|
||||||
|
11. set_mempolicy(): Add mode check.
|
||||||
|
12. mbind(): Fix memory_range_lock deadlock.
|
||||||
|
13. ihk_ikc_recv: Record channel to packet for release
|
||||||
|
14. Add set_cputime() kernel to kernel case and mode enum.
|
||||||
|
15. execve: Call preempt_enable() before error-exit
|
||||||
|
16. memory/x86_64: fix linux safe_kernel_map
|
||||||
|
17. do_kill(): fix pids table when nr of threads is larger than num_processors
|
||||||
|
18. shmget: Use transparent huge pages when page size isn't specified
|
||||||
|
19. prctl: Add support for PR_SET_THP_DISABLE and PR_GET_THP_DISABLE
|
||||||
|
20. monitor_init: fix undetected hang on highest numbered core
|
||||||
|
21. init_process_stack: change premapped stack size based on arch
|
||||||
|
22. x86 syscalls: add a bunch of XXat() delegated syscalls
|
||||||
|
23. do_pageout: fix direct kernel-user access
|
||||||
|
24. stack: add hwcap auxval
|
||||||
|
25. perf counters: add arch-specific perf counters
|
||||||
|
26. Added check of nohost to terminate_host().
|
||||||
|
27. kmalloc: Fix address order in free list
|
||||||
|
28. sysfs: use nr_cpu_ids for cpumasks (fixes libnuma parsing error on ARM)
|
||||||
|
29. monitor_init: Use ihk_mc_cpu_info()
|
||||||
|
30. Fix ThunderX2 write-combined PTE flag insanity
|
||||||
|
31. ARM: eliminate zero page mapping (i.e, init_low_area())
|
||||||
|
32. eliminate futex_cmpxchg_enabled check (not used and dereffed a NULL pointer)
|
||||||
|
33. page_table: Fix return value of lookup_pte when ptl4 is blank
|
||||||
|
34. sysfs: add missing symlinks for cpu/node
|
||||||
|
35. Make Linux handler run when mmap to procfs.
|
||||||
|
36. Separate mmap area from program loading (relocation) area
|
||||||
|
37. move rusage into kernel ELF image (avoid dynamic alloc before NUMA init)
|
||||||
|
38. arm: turn off cpu on panic
|
||||||
|
39. page fault handler: protect thread accesses
|
||||||
|
40. Register PPD and release_handler at the same time.
|
||||||
|
41. fix to missing exclusive processing between terminate() and
|
||||||
|
finalize_process().
|
||||||
|
42. perfctr_stop: add flags to no 'disable_intens'
|
||||||
|
43. fileobj, shmobj: free pages in object destructor (as opposed to page_unmap())
|
||||||
|
44. clear_range_l1, clear_range_middle: Fix handling contiguous PTE
|
||||||
|
45. do_mmap: don't pre-populate the whole file when asked for smaller segment
|
||||||
|
46. invalidate_one_page: Support shmobj and contiguous PTE
|
||||||
|
47. ubsan: fix undefined shifts
|
||||||
|
48. x86: disable zero mapping and add a boot pt for ap trampoline
|
||||||
|
49. rusage: Don't count PF_PATCH change
|
||||||
|
50. Fixed time processing.
|
||||||
|
51. copy_user_pte: vmap area not owned by McKernel
|
||||||
|
52. gencore: Zero-clear ELF header and memory range table
|
||||||
|
53. rpm: ignore CMakeCache.txt in dist and relax BuildRequires on cross build
|
||||||
|
54. gencore: Allocate ELF header to heap instead of stack
|
||||||
|
55. nanosleep: add cpu_pause() in spinwait loop
|
||||||
|
56. init_process: add missing initializations to proc struct
|
||||||
|
57. rus_vm_fault: always use a packet on the stack
|
||||||
|
58. process stack: use PAGE_SIZE in aux vector
|
||||||
|
59. copy_user_pte: base memobj copy on range & VR_PRIVATE
|
||||||
|
60. arm64: ptrace: Fix overwriting 1st argument with return value
|
||||||
|
61. page fault: use cow for private device mappings
|
||||||
|
62. reproductible builds: remove most install paths in c code
|
||||||
|
63. page fault: clear writable bit for non-dirtying access to shared ranges
|
||||||
|
64. mcreboot/mcstop+release: support for regular user execution
|
||||||
|
65. irqbalance_mck: replace extra service with service drop-in
|
||||||
|
66. do_mmap: give addr argument a chance even if not MAP_FIXED
|
||||||
|
67. x86: fix xchg() and cmpxchg() macros
|
||||||
|
68. IHK: support for using Linux work IRQ as IKC interrupt (optional)
|
||||||
|
69. MCS: fix ARM64 issue by using smp_XXX() functions (i.e., barrier()s)
|
||||||
|
70. procfs: add number of threads to stat and status
|
||||||
|
71. memory_range_lock: Fix deadlock in procfs/sysfs handler
|
||||||
|
72. flush instruction cache at context switch time if necessary
|
||||||
|
73. arm64: Fix PMU related functions
|
||||||
|
74. page_fault_process_memory_range: Disable COW for VM region with zeroobj
|
||||||
|
75. extend_process_region: Fall back to demand paging when not contiguous
|
||||||
|
76. munmap: fix deadlock with remote pagefault on vm range lock
|
||||||
|
77. procfs: if memory_range_lock fails, process later
|
||||||
|
78. migrate-cpu: Prevent migration target from calling schedule() twice
|
||||||
|
79. sched_request_migrate(): fix race condition between migration req and IRQs
|
||||||
|
80. get_one_cpu_topology: Renumber core_id (physical core id)
|
||||||
|
81. bb7e140 procfs cpuinfo: use sequence number as processor
|
||||||
|
82. set_host_vma(): do NOT read protect Linux VMA
|
||||||
|
|
||||||
|
===========================================
|
||||||
|
What's new in V1.6.0 (Nov 11, 2018)
|
||||||
|
===========================================
|
||||||
|
|
||||||
|
-----------------------------------------------
|
||||||
|
McKernel new features, improvements and changes
|
||||||
|
-----------------------------------------------
|
||||||
|
1. McKernel and Linux share one unified kernel virtual address space.
|
||||||
|
That is, McKernel sections resides in Linux sections spared for
|
||||||
|
modules. In this way, Linux can access the McKernel kernel memory
|
||||||
|
area.
|
||||||
|
2. hugetlbfs support
|
||||||
|
3. IHK is now included as a git submodule
|
||||||
|
4. Debug messages are turned on/off on a per-source-file basis at run-time.
|
||||||
|
5. It's prohibited for McKernel to access physical memory ranges which
|
||||||
|
Linux didn't give to McKernel.
|
||||||
|
6. UTI (capability to spawn a thread on Linux CPU) improvement:
|
||||||
|
* System calls issued from the thread are hooked by modifying
|
||||||
|
binary in memory.
|
||||||
|
|
||||||
|
---------------------------
|
||||||
|
McKernel bug fixes (digest)
|
||||||
|
---------------------------
|
||||||
|
#<num> below corresponds to the redmine issue number
|
||||||
|
(https://postpeta.pccluster.org/redmine/).
|
||||||
|
|
||||||
|
1. #926: shmget: Hide object with IPC_RMID from shmget
|
||||||
|
2. #1028: init_process: Inherit parent cpu_set
|
||||||
|
3. #995: Fix shebang recorded in argv[0]
|
||||||
|
4. #1024: Fix VMAP virtual address leak
|
||||||
|
5. #1109: init_process_stack: Support "ulimit -s unlimited"
|
||||||
|
6. x86 mem init: do not map identity mapping
|
||||||
|
7. mcexec_wait_syscall: requeue potential request on interrupted wait
|
||||||
|
8. mcctrl_ikc_send_wait: fix interrupt with do_frees == NULL
|
||||||
|
9. pager_req_read: handle short read
|
||||||
|
10. kprintf: only call eventfd() if it is safe to interrupt
|
||||||
|
11. process_procfs_request: Add Pid to /proc/<PID>/status
|
||||||
|
12. terminate: fix oversubscribe hang when waiting for other threads on same CPU to die
|
||||||
|
13. mcexec: Do not close fd returned to mckernel side
|
||||||
|
14. #976: execve: Clear sigaltstack and fp_regs
|
||||||
|
15. #1002: perf_event: Specify counter by bit_mask on start/stop
|
||||||
|
16. #1027: schedule: Don't reschedule immediately when wake up on migrate
|
||||||
|
17. #mcctrl: lookup unexported symbols at runtime
|
||||||
|
18. __sched_wakeup_thread: Notify interrupt_exit() of re-schedule
|
||||||
|
19. futex_wait_queue_me: Spin-sleep when timeout and idle_halt is specified
|
||||||
|
20. #1167: ihk_os_getperfevent,setperfevent: Timeout IKC sent by mcctrl
|
||||||
|
21. devobj: fix object size (POSTK_DEBUG_TEMP_FIX_36)
|
||||||
|
22. mcctrl: remove rus page cache
|
||||||
|
23. #1021: procfs: Support multiple reads of e.g. /proc/*/maps
|
||||||
|
24. #1006: wait: Delay wake-up parent within switch context
|
||||||
|
25. #1164: mem: Check if phys-mem is within the range of McKernel memory
|
||||||
|
26. #1039: page_fault_process_memory_range: Remove ihk_mc_map_virtual for CoW of device map
|
||||||
|
27. partitioned execution: pass process rank to LWK
|
||||||
|
28. process/vm: implement access_ok()
|
||||||
|
29. spinlock: rewrite spinlock to use Linux ticket head/tail format
|
||||||
|
30. #986: Fix deadlock involving mmap_sem and memory_range_lock
|
||||||
|
31. Prevent one CPU from getting chosen by concurrent forks
|
||||||
|
32. #1009: check_signal: system call restart is done only once
|
||||||
|
33. #1176: syscall: the signal received during system call processing is not processed.
|
||||||
|
34. #1036 syscall_time: Handle by McKernel
|
||||||
|
35. #1165 do_syscall: Delegate system calls to the mcexec with the same pid
|
||||||
|
36. #1194 execve: Fix calling ptrace_report_signal after preemption is disabled
|
||||||
|
37. #1005 coredump: Exclude special areas
|
||||||
|
38. #1018 procfs: Fix pread/pwrite to procfs fail when specified size is bigger than 4MB
|
||||||
|
39. #1180 sched_setaffinity: Check migration after decrementing in_interrupt
|
||||||
|
40. #771, #1179, #1143 ptrace supports threads
|
||||||
|
41. #1189 procfs/do_fork: wait until procfs entries are registered
|
||||||
|
42. #1114 procfs: add '/proc/pid/stat' to mckernel side and fix its comm
|
||||||
|
43. #1116 mcctrl procfs: check entry was returned before using it
|
||||||
|
44. #1167 ihk_os_getperfevent,setperfevent: Return -ETIME when IKC timeouts
|
||||||
|
45. mcexec/execve: fix shebangs handling
|
||||||
|
46. procfs: handle 'comm' on mckernel side
|
||||||
|
47. ihk_os_setperfevent: Return number of registered events
|
||||||
|
48. mcexec: fix terminating zero after readlink()
|
||||||
|
|
||||||
|
===========================================
|
||||||
|
What's new in V1.5.1 (July 9, 2018)
|
||||||
|
===========================================
|
||||||
|
|
||||||
|
-----------------------------------------------
|
||||||
|
McKernel new features, improvements and changes
|
||||||
|
-----------------------------------------------
|
||||||
|
1. Watchdog timer to detect hang of McKernel
|
||||||
|
mcexec prints out the following line to its stderr when a hang of
|
||||||
|
McKernel is detected.
|
||||||
|
|
||||||
|
mcexec detected hang of McKernel
|
||||||
|
|
||||||
|
The watchdog timer is enabled by passing -i <timeout_in_sec> option
|
||||||
|
to mcreboot.sh. <timeout_in_sec> specifies the interval of checking
|
||||||
|
if McKernel is alive.
|
||||||
|
Example: mcreboot.sh -i 600: Detect the hang with 10 minutes interval
|
||||||
|
|
||||||
|
The detailed step of the hang detection is as follows.
|
||||||
|
(1) mcexec acquires eventfd for notification from IHK and perform
|
||||||
|
epoll() on it.
|
||||||
|
(2) A daemon called ihkmond monitors the state of McKernel periodically
|
||||||
|
with the interval specified by the -i option. It judges that
|
||||||
|
McKernel is hanging and notifies mcexec by the eventfd if its
|
||||||
|
state hasn't changed since the last check.
|
||||||
|
|
||||||
|
2. Documentation
|
||||||
|
man page: Installed directory is changed to <install_dir>/share/man
|
||||||
|
|
||||||
|
---------------------------
|
||||||
|
McKernel bug fixes (digest)
|
||||||
|
---------------------------
|
||||||
|
1. #1146: pager_req_map(): do not take mmap_sem if not needed
|
||||||
|
2. #1135: prepare_process_ranges_args_envs(): fix saving cmdline
|
||||||
|
3. #1144: fileobj/devobj: record path name
|
||||||
|
4. #1145: fileobj: use MCS locks for per-file page hash
|
||||||
|
5. #1076: mcctrl: refactor prepare_image into new generic ikc send&wait
|
||||||
|
6. #1072: execve: fix execve with oversubscribing
|
||||||
|
7. #1132: execve: use thread variable instead of cpu_local_var(current)
|
||||||
|
8. #1117: mprotect: do not set page table writable for cow pages
|
||||||
|
9. #1143: syscall wait4: add _WALL (POSTK_DEBUG_ARCH_DEP_44)
|
||||||
|
10. #1064: rusage: Fix initialization of rusage->num_processors
|
||||||
|
11. #1133: pager_req_unmap: Put per-process data at exit
|
||||||
|
12. #731: do_fork: Propagate error code returned by mcexec
|
||||||
|
13. #1149: execve: Reinitialize vm_regions's map area on execve
|
||||||
|
14. #1065: procfs: Show file names in /proc/<PID>/maps
|
||||||
|
15. #1112: mremap: Fix type of size arguments (from ssize_t to size_t)
|
||||||
|
16. #1121: sched_getaffinity: Check arguments in the same order as in Linux
|
||||||
|
17. #1137: mmap, mremap: Check arguments in the same order as in Linux
|
||||||
|
18. #1122: fix return value of sched_getaffinity
|
||||||
|
19. #732: fix: /proc/<PID>/maps outputs a unnecessary NULL character
|
||||||
|
|
||||||
|
===================================
|
||||||
|
What's new in V1.5.0 (Apr 5, 2018)
|
||||||
|
===================================
|
||||||
|
|
||||||
|
--------------------------------------
|
||||||
|
McKernel new features and improvements
|
||||||
|
--------------------------------------
|
||||||
|
1. Aid for Linux version migration: Detect /proc, /sys format change
|
||||||
|
between two kernel versions
|
||||||
|
2. Swap out
|
||||||
|
* Only swap-out anonymous pages for now
|
||||||
|
3. Improve support of /proc/maps
|
||||||
|
4. mcstat: Linux tool to show resource usage
|
||||||
|
|
||||||
|
---------------------------
|
||||||
|
McKernel bug fixes (digest)
|
||||||
|
---------------------------
|
||||||
|
1. #727: execve: Fix memory leak when receiving SIGKILL
|
||||||
|
2. #829: perf_event_open: Support PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
|
||||||
|
3. #906: mcexec: Check return code of fork()
|
||||||
|
4. #1038: mcexec: Timeout when incorrect value is given to -n option
|
||||||
|
5. #943 #945 #946 #960 $961: mcexec: Support strace
|
||||||
|
6. #1029: struct thread is not released with stress-test involving signal
|
||||||
|
and futex
|
||||||
|
7. #863 #870: Respond immediately to terminating signal when
|
||||||
|
offloading system call
|
||||||
|
8. #1119: translate_rva_to_rpa(): use 2MB blocks in 1GB pages on x86
|
||||||
|
11. #898: Shutdown OS only after no in-flight IKC exist
|
||||||
|
12. #882: release_handler: Destroy objects as the process which opened it
|
||||||
|
13. #882: mcexec: Make child process exit if the parent is killed during
|
||||||
|
fork()
|
||||||
|
14. #925: XPMEM: Don't destroy per-process object of the parent
|
||||||
|
15. #885: ptrace: Support the case where a process attaches its child
|
||||||
|
16. #1031: sigaction: Support SA_RESETHAND
|
||||||
|
17. #923: rus_vm_fault: Return error when a thread not performing
|
||||||
|
system call offloading causes remote page fault
|
||||||
|
18. #1032 #1033 #1034: getrusage: Fix ru_maxrss, RUSAGE_CHILDREN,
|
||||||
|
ru_stime related bugs
|
||||||
|
19. #1120: getrusage: Fix deadlock on thread->times_update
|
||||||
|
20. #1123: Fix deadlock related to wait_queue_head_list_node
|
||||||
|
21. #1124: Fix deadlock of calling terminate() from terminate()
|
||||||
|
22. #1125: Fix deadlock related to thread status
|
||||||
|
* Related functions are: hold_thread(), do_kill() and terminate()
|
||||||
|
23. #1126: uti: Fix uti thread on the McKernel side blocks others in do_syscall()
|
||||||
|
24. #1066: procfs: Show Linux /proc/self/cgroup
|
||||||
|
25. #1127: prepare_process_ranges_args_envs(): fix generating saved_cmdline to
|
||||||
|
avoid PF in strlen()
|
||||||
|
26. #1128: ihk_mc_map/unmap_virtual(): do proper TLB invalidation
|
||||||
|
27. #1043: terminate(): fix update_lock and threads_lock order to avoid deadlock
|
||||||
|
28. #1129: mcreboot.sh: Save /proc/irq/*/smp_affinity to /tmp/mcreboot
|
||||||
|
29. #1130: mcexec: drop READ_IMPLIES_EXEC from personality
|
||||||
|
|
||||||
|
--------------------
|
||||||
|
McKernel workarounds
|
||||||
|
--------------------
|
||||||
|
1. Forbid CPU oversubscription
|
||||||
|
* It can be turned on by mcreboot.sh -O option
|
||||||
|
|
||||||
|
|
||||||
|
===================================
|
||||||
|
What's new in V1.4.0 (Oct 30, 2017)
|
||||||
|
===================================
|
||||||
|
|
||||||
|
-----------------------------------------------------------
|
||||||
|
Feature: Abstracted event type support in perf_event_open()
|
||||||
|
-----------------------------------------------------------
|
||||||
|
PERF_TYPE_HARDWARE and PERF_TYPE_CACHE types are supported.
|
||||||
|
|
||||||
|
----------------------------------
|
||||||
|
Clean-up: Direct user-space access
|
||||||
|
----------------------------------
|
||||||
|
Code lines using direct user-space access (e.g. passing user-space
|
||||||
|
pointer to memcpy()) becomes more portable across processor
|
||||||
|
architectures. The modification follows the following rules.
|
||||||
|
|
||||||
|
1. Move the code section as it is to the architecture dependent
|
||||||
|
directory if it is a part of the critical-path.
|
||||||
|
2. Otherwise, rewrite the code section by using the portable methods.
|
||||||
|
The methods include copy_from_user(), copy_to_user(),
|
||||||
|
pte_get_phys() and phys_to_virt().
|
||||||
|
|
||||||
|
--------------------------------
|
||||||
|
Test: MPI and OpenMP micro-bench
|
||||||
|
--------------------------------
|
||||||
|
The performance figures of MPI and OpenMP primitives are compared with
|
||||||
|
those of Linux by using Intel MPI Benchmarks and EPCC OpenMP Micro
|
||||||
|
Benchmark.
|
||||||
|
|
||||||
|
|
||||||
|
===================================
|
||||||
|
What's new in V1.3.0 (Sep 30, 2017)
|
||||||
|
===================================
|
||||||
|
|
||||||
|
--------------------
|
||||||
|
Feature: Kernel dump
|
||||||
|
--------------------
|
||||||
|
1. A dump level of "only kernel memory" is added.
|
||||||
|
|
||||||
|
The following two levels are available now:
|
||||||
|
0: Dump all
|
||||||
|
24: Dump only kernel memory
|
||||||
|
|
||||||
|
The dump level can be set by -d option in ihkosctl or the argument
|
||||||
|
for ihk_os_makedumpfile(), as shown in the following examples:
|
||||||
|
|
||||||
|
Command: ihkosctl 0 dump -d 24
|
||||||
|
Function call: ihk_os_makedumpfile(0, NULL, 24, 0);
|
||||||
|
|
||||||
|
2. Dump file is created when Linux panics.
|
||||||
|
|
||||||
|
The dump level can be set by dump_level kernel argument, as shown in the
|
||||||
|
following example:
|
||||||
|
|
||||||
|
ihkosctl 0 kargs "hidos dump_level=24"
|
||||||
|
|
||||||
|
The IHK dump function is registered to panic_notifier_list when creating
|
||||||
|
/dev/mcdX and called when Linux panics.
|
||||||
|
|
||||||
|
-----------------------------
|
||||||
|
Feature: Quick Process Launch
|
||||||
|
-----------------------------
|
||||||
|
|
||||||
|
MPI process launch time and some of the initialization time can be
|
||||||
|
reduced in application consisting of multiple MPI programs which are
|
||||||
|
launched in turn in the job script.
|
||||||
|
|
||||||
|
The following two steps should be performed to use this feature:
|
||||||
|
1. Replace mpiexec with ql_mpiexec_start and add some lines for
|
||||||
|
ql_mpiexec_finalize in the job script
|
||||||
|
2. Modify the app so that it can repeat calculations and wait for the
|
||||||
|
instructions from ql_mpiexec_{start,finalize} at the end of the
|
||||||
|
loop
|
||||||
|
|
||||||
|
The first step is explained using an example. Assume the original job
|
||||||
|
script looks like this:
|
||||||
|
|
||||||
|
/* Execute ensemble simulation and then data assimilation, and repeat this
|
||||||
|
ten times */
|
||||||
|
for i in {1..10}; do
|
||||||
|
|
||||||
|
/* Each ensemble simulation execution uses 100 nodes, launch ten of them
|
||||||
|
in parallel */
|
||||||
|
for j in {1..10}; do
|
||||||
|
mpiexec -n 100 -machinefile ./list1_$j p1.out a1 & pids[$i]=$!;
|
||||||
|
done
|
||||||
|
|
||||||
|
/* Wait until the ten ensemble simulation programs finish */
|
||||||
|
for j in {1..10}; do wait ${pids[$j]}; done
|
||||||
|
|
||||||
|
/* Launch one data assimilation program using 1000 nodes */
|
||||||
|
mpiexec -n 1000 -machinefile ./list2 p2.out a2
|
||||||
|
done
|
||||||
|
|
||||||
|
The job script should be modified like this:
|
||||||
|
|
||||||
|
for i in {1..10}; do
|
||||||
|
for j in {1..10}; do
|
||||||
|
/* Replace mpiexec with ql_mpiexec_start */
|
||||||
|
ql_mpiexec_start -n 100 -machinefile ./list1_$j p1.out a1 & pids[$j]=$!;
|
||||||
|
done
|
||||||
|
|
||||||
|
for j in {1..10}; do wait ${pids[$j]}; done
|
||||||
|
|
||||||
|
ql_mpiexec_start -n 1000 -machinefile ./list2 p2.out a2
|
||||||
|
done
|
||||||
|
|
||||||
|
/* p1.out and p2.out don't exit but are waiting for the next calculation.
|
||||||
|
So tell them to exit */
|
||||||
|
for j in {1..10}; do
|
||||||
|
ql_mpiexec_finalize -machinefile ./list1_$i p1.out a1;
|
||||||
|
done
|
||||||
|
ql_mpiexec_finalize -machinefile ./list2 p2.out a2;
|
||||||
|
|
||||||
|
|
||||||
|
The second step is explained using a pseudo-code.
|
||||||
|
|
||||||
|
MPI_Init();
|
||||||
|
Prepare data exchange with preceding / following MPI programs
|
||||||
|
loop:
|
||||||
|
foreach Fortran module
|
||||||
|
Initialize data using command-line arguments, parameter files,
|
||||||
|
environment variables
|
||||||
|
Input data from preceding MPI programs / Read snap-shot
|
||||||
|
Perform main calculation
|
||||||
|
Output data to following MPI programs / Write snap-shot
|
||||||
|
/* ql_client() waits for command of ql_mpiexec_{start,finish} */
|
||||||
|
if (ql_client() == QL_CONTINUE) { goto loop; }
|
||||||
|
MPI_Finalize();
|
||||||
|
|
||||||
|
qlmpilib.h should be included in the code and libql{mpi,fort}.so
|
||||||
|
should be linked to the executable file.
|
||||||
|
|
||||||
|
|
||||||
|
========================
|
||||||
|
Restrictions on McKernel
|
||||||
|
========================
|
||||||
|
|
||||||
|
1. Pseudo devices such as /dev/mem and /dev/zero are not mmap()ed
|
||||||
|
correctly even if the mmap() returns a success. An access of their
|
||||||
|
mapping receives the SIGSEGV signal.
|
||||||
|
|
||||||
|
2. clone() supports only the following flags. All the other flags
|
||||||
|
cause clone() to return error or are simply ignored.
|
||||||
|
|
||||||
|
* CLONE_CHILD_CLEARTID
|
||||||
|
* CLONE_CHILD_SETTID
|
||||||
|
* CLONE_PARENT_SETTID
|
||||||
|
* CLONE_SETTLS
|
||||||
|
* CLONE_SIGHAND
|
||||||
|
* CLONE_VM
|
||||||
|
|
||||||
|
3. PAPI has the following restriction.
|
||||||
|
|
||||||
|
* Number of counters a user can use at the same time is up to the
|
||||||
|
number of the physical counters in the processor.
|
||||||
|
|
||||||
|
4. msync writes back only the modified pages mapped by the calling process.
|
||||||
|
|
||||||
|
5. The following syscalls always return the ENOSYS error.
|
||||||
|
|
||||||
|
* migrate_pages()
|
||||||
|
* move_pages()
|
||||||
|
* set_robust_list()
|
||||||
|
|
||||||
|
6. The following syscalls always return the EOPNOTSUPP error.
|
||||||
|
|
||||||
|
* arch_prctl(ARCH_SET_GS)
|
||||||
|
* signalfd()
|
||||||
|
|
||||||
|
7. signalfd4() returns a fd, but signal is not notified through the
|
||||||
|
fd.
|
||||||
|
|
||||||
|
8. set_rlimit sets the limit values but they are not enforced.
|
||||||
|
|
||||||
|
9. Address randomization is not supported.
|
||||||
|
|
||||||
|
10. brk() extends the heap more than requested when -h
|
||||||
|
(--extend-heap-by=)<step> option of mcexec is used with the value
|
||||||
|
larger than 4 KiB. syscall_pwrite02 of LTP would fail for this
|
||||||
|
reason. This is because the test expects that the end of the heap
|
||||||
|
is set to the same address as the argument of sbrk() and expects a
|
||||||
|
segmentation violation occurs when it tries to access the memory
|
||||||
|
area right next to the boundary. However, the optimization sets
|
||||||
|
the end to a value larger than the requested. Therefore, the
|
||||||
|
expected segmentation violation doesn't occur.
|
||||||
|
|
||||||
|
11. setpriority()/getpriority() won't work. They might set/get the
|
||||||
|
priority of a random mcexec thread. This is because there's no
|
||||||
|
fixed correspondence between a McKernel thread which issues the
|
||||||
|
system call and a mcexec thread which handles the offload request.
|
||||||
|
|
||||||
|
12. mbind() can set the policy but it is not used when allocating
|
||||||
|
physical pages.
|
||||||
|
|
||||||
|
13. MPOL_F_RELATIVE_NODES and MPOL_INTERLEAVE flags for
|
||||||
|
set_mempolicy()/mbind() are not supported.
|
||||||
|
|
||||||
|
14. The MPOL_BIND policy for set_mempolicy()/mbind() works as the same
|
||||||
|
as the MPOL_PREFERRED policy. That is, the physical page allocator
|
||||||
|
doesn't give up the allocation when the specified nodes are
|
||||||
|
running out of pages but continues to search pages in the other
|
||||||
|
nodes.
|
||||||
|
|
||||||
|
15. Kernel dump on Linux panic requires Linux kernel CentOS-7.4 and
|
||||||
|
later. In addition, crash_kexec_post_notifiers kernel argument
|
||||||
|
must be given to Linux kernel.
|
||||||
|
|
||||||
|
16. setfsuid()/setfsgid() cannot change the id of the calling thread.
|
||||||
|
Instead, it changes that of the mcexec worker thread which takes
|
||||||
|
the system-call offload request.
|
||||||
|
|
||||||
|
17. mmap (hugeTLBfs): The physical pages corresponding to a map are
|
||||||
|
released when no McKernel process exist. The next map gets fresh
|
||||||
|
physical pages.
|
||||||
|
|
||||||
|
18. Sticky bit on executable file has no effect.
|
||||||
|
|
||||||
|
19. Linux (RHEL-7 for x86_64) could hang when offlining CPUs in the
|
||||||
|
process of booting McKernel due to the Linux bug, found in
|
||||||
|
Linux-3.10 and fixed in the later version. One way to circumvent
|
||||||
|
this is to always assign the same CPU set to McKernel.
|
||||||
|
|
||||||
|
20. madvise:
|
||||||
|
* MADV_HWPOISON and MADV_SOFT_OFFLINE always returns -EPERM.
|
||||||
|
* MADV_MERGEABLE and MADV_UNMERGEABLE always returns -EINVAL.
|
||||||
|
* MADV_HUGEPAGE and MADV_NOHUGEPAGE on file map returns -EINVAL
|
||||||
|
(It succeeds on RHEL-8 for aarch64).
|
||||||
|
|
||||||
|
21. brk() and mmap() doesn't report out-of-memory through its return
|
||||||
|
value. Instead, page-fault reports the error.
|
||||||
|
|
||||||
|
22. Anonymous mmap pre-maps requested number of pages when contiguous
|
||||||
|
pages are available. Demand paging is used when not available.
|
||||||
|
|
||||||
|
23. Mixing page sizes in anonymous shared mapping is not allowed. mmap
|
||||||
|
creates vm_range with one page size. And munmap or mremap that
|
||||||
|
needs the reduced page size changes the sizes of all the pages of
|
||||||
|
the vm_range.
|
||||||
|
|
||||||
|
24. ihk_os_getperfevent() could time-out when invoked from Fujitsu TCS
|
||||||
|
(job-scheduler).
|
||||||
109
README.md
109
README.md
@@ -10,7 +10,7 @@ IHK/McKernel is a light-weight multi-kernel operating system designed for high-e
|
|||||||
|
|
||||||
## Contents
|
## Contents
|
||||||
|
|
||||||
- [Background] (#background)
|
- [Background](#background-and-motivation)
|
||||||
- [Architectural Overview](#architectural-overview)
|
- [Architectural Overview](#architectural-overview)
|
||||||
- [Installation](#installation)
|
- [Installation](#installation)
|
||||||
- [The Team](#the-team)
|
- [The Team](#the-team)
|
||||||
@@ -85,7 +85,7 @@ sudo reboot
|
|||||||
You will need the following packages installed:
|
You will need the following packages installed:
|
||||||
|
|
||||||
~~~~
|
~~~~
|
||||||
sudo yum install kernel-devel binutils-devel libnuma-devel
|
sudo yum install cmake kernel-devel binutils-devel systemd-devel numactl-devel gcc make nasm git
|
||||||
~~~~
|
~~~~
|
||||||
|
|
||||||
Grant read permission to the System.map file of your kernel version:
|
Grant read permission to the System.map file of your kernel version:
|
||||||
@@ -96,24 +96,51 @@ sudo chmod a+r /boot/System.map-`uname -r`
|
|||||||
|
|
||||||
##### 4. Obtain sources and compile the kernel
|
##### 4. Obtain sources and compile the kernel
|
||||||
|
|
||||||
Clone the source code and set up ihk symlink (this is currently required):
|
Clone the source code:
|
||||||
|
|
||||||
~~~~
|
~~~~
|
||||||
mkdir -p ~/src/ihk+mckernel/
|
mkdir -p ~/src/ihk+mckernel/
|
||||||
cd ~/src/ihk+mckernel/
|
cd ~/src/ihk+mckernel/
|
||||||
git clone -r git@github.com:RIKEN-SysSoft/mckernel.git
|
git clone --recursive -b development https://github.com/RIKEN-SysSoft/mckernel.git
|
||||||
~~~~
|
~~~~
|
||||||
|
|
||||||
|
(Optional) Checkout to the specific branch or version:
|
||||||
|
|
||||||
|
~~~~
|
||||||
|
cd mckernel
|
||||||
|
git checkout <pathspec>
|
||||||
|
git submodule update
|
||||||
|
~~~~
|
||||||
|
|
||||||
|
For example, if you want to try the development branch, use "development" as the pathspec. If you want to try the prerelease version 1.7.0-0.2, use "1.7.0-0.2".
|
||||||
|
|
||||||
|
###### 4.1 Install with cmake
|
||||||
|
|
||||||
Configure and compile:
|
Configure and compile:
|
||||||
|
|
||||||
~~~~
|
~~~~
|
||||||
mkdir -p build && cd build
|
mkdir -p build && cd build
|
||||||
cmake -DCMAKE_INSTALL_PREFIX=${HOME}/ihk+mckernel $HOME/src/mckernel
|
cmake -DCMAKE_INSTALL_PREFIX=${HOME}/ihk+mckernel $HOME/src/ihk+mckernel/mckernel
|
||||||
make -j install
|
make -j install
|
||||||
~~~~
|
~~~~
|
||||||
|
|
||||||
The IHK kernel modules and McKernel kernel image should be installed under the **ihk+mckernel** folder in your home directory.
|
The IHK kernel modules and McKernel kernel image should be installed under the **ihk+mckernel** folder in your home directory.
|
||||||
|
|
||||||
|
###### 4.2 Install with rpm
|
||||||
|
|
||||||
|
Configure, compile and build rpm:
|
||||||
|
|
||||||
|
~~~~
|
||||||
|
mkdir -p build && cd build
|
||||||
|
cmake $HOME/src/ihk+mckernel/mckernel
|
||||||
|
make dist
|
||||||
|
cp mckernel-<version>.tar.gz <rpmbuild>/SOURCES
|
||||||
|
rpm -ba scripts/mckernel.spec
|
||||||
|
sudo rpm -ivh <rpmbuild>/RPMS/<arch>/mckernel-<version>-<release>_<linux_kernel_ver>_<dist>.<arch>.rpm
|
||||||
|
~~~~
|
||||||
|
|
||||||
|
The IHK kernel modules and McKernel kernel image are installed under the system directory.
|
||||||
|
|
||||||
##### 5. Boot McKernel
|
##### 5. Boot McKernel
|
||||||
|
|
||||||
A boot script called mcreboot.sh is provided under sbin in the install folder. To boot on logical CPU 1 with 512MB of memory, use the following invocation:
|
A boot script called mcreboot.sh is provided under sbin in the install folder. To boot on logical CPU 1 with 512MB of memory, use the following invocation:
|
||||||
@@ -170,6 +197,71 @@ Finally, to shutdown McKernel and release CPU/memory resources back to Linux use
|
|||||||
sudo ./sbin/mcstop+release.sh
|
sudo ./sbin/mcstop+release.sh
|
||||||
~~~~
|
~~~~
|
||||||
|
|
||||||
|
##### 7. Advanced: Enable Utility Thread offloading Interface (UTI)
|
||||||
|
|
||||||
|
UTI enables a runtime such as MPI runtime to spawn utility threads such as MPI asynchronous progress threads to Linux cores.
|
||||||
|
|
||||||
|
1. Install capstone
|
||||||
|
|
||||||
|
Install EPEL capstone-devel:
|
||||||
|
|
||||||
|
~~~~
|
||||||
|
sudo yum install epel-release
|
||||||
|
sudo yum install capstone-devel
|
||||||
|
~~~~
|
||||||
|
|
||||||
|
2. Install syscall_intercept
|
||||||
|
|
||||||
|
~~~~
|
||||||
|
git clone https://github.com/RIKEN-SysSoft/syscall_intercept.git
|
||||||
|
cmake ../arch/aarch64 -DCMAKE_INSTALL_PREFIX=<syscall-intercept-install> -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DTREAT_WARNINGS_AS_ERRORS=OFF
|
||||||
|
~~~~
|
||||||
|
|
||||||
|
3. Install UTI for McKernel
|
||||||
|
|
||||||
|
Install:
|
||||||
|
|
||||||
|
~~~~
|
||||||
|
git clone https://github.com/RIKEN-SysSoft/uti.git
|
||||||
|
mkdir build && cd build
|
||||||
|
../uti/configure --prefix=<mckernel-install> --with-rm=mckernel
|
||||||
|
make && make install
|
||||||
|
~~~~
|
||||||
|
|
||||||
|
4. Install McKernel
|
||||||
|
|
||||||
|
~~~~
|
||||||
|
CMAKE_PREFIX_PATH=<syscall-intercept-install> cmake -DCMAKE_INSTALL_PREFIX=${HOME}/ihk+mckernel -DENABLE_UTI=ON $HOME/src/ihk+mckernel/mckernel
|
||||||
|
~~~~
|
||||||
|
|
||||||
|
5. Run executable
|
||||||
|
|
||||||
|
~~~~
|
||||||
|
mcexec --enable-uti <command>
|
||||||
|
~~~~
|
||||||
|
|
||||||
|
6. Install UTI for Linux for performance comparison
|
||||||
|
|
||||||
|
Install by make:
|
||||||
|
|
||||||
|
~~~~
|
||||||
|
git clone https://github.com/RIKEN-SysSoft/uti.git
|
||||||
|
mkdir build && cd build
|
||||||
|
../uti/configure --prefix=<uti-install> --with-rm=linux
|
||||||
|
make && make install
|
||||||
|
~~~~
|
||||||
|
|
||||||
|
Install by rpm:
|
||||||
|
|
||||||
|
~~~~
|
||||||
|
git clone https://github.com/RIKEN-SysSoft/uti.git
|
||||||
|
mkdir build && cd build
|
||||||
|
../uti/configure --prefix=<uti-install> --with-rm=linux
|
||||||
|
rm -f ~/rpmbuild/SOURCES/<version>.tar.gz
|
||||||
|
rpmbuild -ba ./scripts/uti.spec
|
||||||
|
rpm -Uvh uti-<version>-<release>-<arch>.rpm
|
||||||
|
~~~~
|
||||||
|
|
||||||
## The Team
|
## The Team
|
||||||
|
|
||||||
The McKernel project was started at The University of Tokyo and currently it is mainly developed at RIKEN.
|
The McKernel project was started at The University of Tokyo and currently it is mainly developed at RIKEN.
|
||||||
@@ -184,3 +276,10 @@ Some of our collaborators include:
|
|||||||
## License
|
## License
|
||||||
|
|
||||||
McKernel is GPL licensed, as found in the LICENSE file.
|
McKernel is GPL licensed, as found in the LICENSE file.
|
||||||
|
|
||||||
|
## Contact
|
||||||
|
|
||||||
|
Please give your feedback to us via one of the following mailing lists. Subscription via [www.pccluster.org](http://www.pccluster.org/mailman/listinfo/mckernel-users) is needed.
|
||||||
|
|
||||||
|
* English: mckernel-users@pccluster.org
|
||||||
|
* Japanese: mckernel-users-jp@pccluster.org
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
/* assert.c COPYRIGHT FUJITSU LIMITED 2015-2018 */
|
/* assert.c COPYRIGHT FUJITSU LIMITED 2015-2019 */
|
||||||
|
|
||||||
#include <process.h>
|
#include <process.h>
|
||||||
#include <list.h>
|
#include <list.h>
|
||||||
@@ -53,4 +53,4 @@ STATIC_ASSERT(SVE_PT_FPSIMD_OFFSET == sizeof(struct user_sve_header));
|
|||||||
STATIC_ASSERT(SVE_PT_SVE_OFFSET == sizeof(struct user_sve_header));
|
STATIC_ASSERT(SVE_PT_SVE_OFFSET == sizeof(struct user_sve_header));
|
||||||
|
|
||||||
/* assert for struct arm64_cpu_local_thread member offset define */
|
/* assert for struct arm64_cpu_local_thread member offset define */
|
||||||
STATIC_ASSERT(offsetof(struct arm64_cpu_local_thread, panic_regs) == 160);
|
STATIC_ASSERT(offsetof(struct arm64_cpu_local_thread, panic_regs) == 168);
|
||||||
|
|||||||
@@ -1,9 +1,15 @@
|
|||||||
/* coredump.c COPYRIGHT FUJITSU LIMITED 2015-2016 */
|
/* coredump.c COPYRIGHT FUJITSU LIMITED 2015-2019 */
|
||||||
#include <process.h>
|
#include <process.h>
|
||||||
#include <elfcore.h>
|
#include <elfcore.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <ptrace.h>
|
||||||
|
#include <cls.h>
|
||||||
|
#include <hwcap.h>
|
||||||
|
|
||||||
void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread, void *regs0)
|
#define align32(x) ((((x) + 3) / 4) * 4)
|
||||||
|
|
||||||
|
void arch_fill_prstatus(struct elf_prstatus64 *prstatus,
|
||||||
|
struct thread *thread, void *regs0, int sig)
|
||||||
{
|
{
|
||||||
struct pt_regs *regs = regs0;
|
struct pt_regs *regs = regs0;
|
||||||
struct elf_prstatus64 tmp_prstatus;
|
struct elf_prstatus64 tmp_prstatus;
|
||||||
@@ -14,8 +20,6 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,
|
|||||||
short int pr_cursig;
|
short int pr_cursig;
|
||||||
a8_uint64_t pr_sigpend;
|
a8_uint64_t pr_sigpend;
|
||||||
a8_uint64_t pr_sighold;
|
a8_uint64_t pr_sighold;
|
||||||
pid_t pr_pid;
|
|
||||||
pid_t pr_ppid;
|
|
||||||
pid_t pr_pgrp;
|
pid_t pr_pgrp;
|
||||||
pid_t pr_sid;
|
pid_t pr_sid;
|
||||||
struct prstatus64_timeval pr_utime;
|
struct prstatus64_timeval pr_utime;
|
||||||
@@ -23,10 +27,66 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,
|
|||||||
struct prstatus64_timeval pr_cutime;
|
struct prstatus64_timeval pr_cutime;
|
||||||
struct prstatus64_timeval pr_cstime;
|
struct prstatus64_timeval pr_cstime;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* copy x0-30, sp, pc, pstate */
|
/* copy x0-30, sp, pc, pstate */
|
||||||
memcpy(&tmp_prstatus.pr_reg, ®s->user_regs, sizeof(tmp_prstatus.pr_reg));
|
memcpy(&tmp_prstatus.pr_reg, ®s->user_regs, sizeof(tmp_prstatus.pr_reg));
|
||||||
tmp_prstatus.pr_fpvalid = 0; /* We assume no fp */
|
tmp_prstatus.pr_fpvalid = 0; /* We assume no fp */
|
||||||
|
|
||||||
/* copy unaligned prstatus addr */
|
/* copy unaligned prstatus addr */
|
||||||
memcpy(prstatus, &tmp_prstatus, sizeof(*prstatus));
|
memcpy(prstatus, &tmp_prstatus, sizeof(*prstatus));
|
||||||
|
|
||||||
|
prstatus->pr_pid = thread->tid;
|
||||||
|
if (thread->proc->parent) {
|
||||||
|
prstatus->pr_ppid = thread->proc->parent->pid;
|
||||||
|
}
|
||||||
|
|
||||||
|
prstatus->pr_info.si_signo = sig;
|
||||||
|
prstatus->pr_cursig = sig;
|
||||||
|
}
|
||||||
|
|
||||||
|
int arch_get_thread_core_info_size(void)
|
||||||
|
{
|
||||||
|
const struct user_regset_view *view = current_user_regset_view();
|
||||||
|
const struct user_regset *regset = find_regset(view, NT_ARM_SVE);
|
||||||
|
|
||||||
|
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return sizeof(struct note) + align32(sizeof("LINUX"))
|
||||||
|
+ regset_size(cpu_local_var(current), regset);
|
||||||
|
}
|
||||||
|
|
||||||
|
void arch_fill_thread_core_info(struct note *head,
|
||||||
|
struct thread *thread, void *regs)
|
||||||
|
{
|
||||||
|
const struct user_regset_view *view = current_user_regset_view();
|
||||||
|
const struct user_regset *regset = find_regset(view, NT_ARM_SVE);
|
||||||
|
|
||||||
|
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* pre saved registers */
|
||||||
|
save_fp_regs(thread);
|
||||||
|
|
||||||
|
if (regset->core_note_type && regset->get &&
|
||||||
|
(!regset->active || regset->active(thread, regset))) {
|
||||||
|
int ret;
|
||||||
|
size_t size = regset_size(thread, regset);
|
||||||
|
void *namep;
|
||||||
|
void *descp;
|
||||||
|
|
||||||
|
namep = (void *) (head + 1);
|
||||||
|
descp = namep + align32(sizeof("LINUX"));
|
||||||
|
|
||||||
|
ret = regset->get(thread, regset, 0, size, descp, NULL);
|
||||||
|
if (ret) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
head->namesz = sizeof("LINUX");
|
||||||
|
head->descsz = size;
|
||||||
|
head->type = NT_ARM_SVE;
|
||||||
|
memcpy(namep, "LINUX", sizeof("LINUX"));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
/* cpu.c COPYRIGHT FUJITSU LIMITED 2015-2018 */
|
/* cpu.c COPYRIGHT FUJITSU LIMITED 2015-2019 */
|
||||||
#include <ihk/cpu.h>
|
#include <ihk/cpu.h>
|
||||||
#include <ihk/debug.h>
|
|
||||||
#include <ihk/mm.h>
|
#include <ihk/mm.h>
|
||||||
#include <types.h>
|
#include <types.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
@@ -30,9 +29,11 @@
|
|||||||
#include <debug-monitors.h>
|
#include <debug-monitors.h>
|
||||||
#include <sysreg.h>
|
#include <sysreg.h>
|
||||||
#include <cpufeature.h>
|
#include <cpufeature.h>
|
||||||
#include <debug.h>
|
#include <ihk/debug.h>
|
||||||
#include <hwcap.h>
|
#include <hwcap.h>
|
||||||
#include <virt.h>
|
#include <virt.h>
|
||||||
|
#include <init.h>
|
||||||
|
#include <bootparam.h>
|
||||||
|
|
||||||
//#define DEBUG_PRINT_CPU
|
//#define DEBUG_PRINT_CPU
|
||||||
|
|
||||||
@@ -67,6 +68,7 @@ void (*gic_dist_init)(unsigned long dist_base_pa, unsigned long size);
|
|||||||
void (*gic_cpu_init)(unsigned long cpu_base_pa, unsigned long size);
|
void (*gic_cpu_init)(unsigned long cpu_base_pa, unsigned long size);
|
||||||
void (*gic_enable)(void);
|
void (*gic_enable)(void);
|
||||||
void (*arm64_issue_ipi)(unsigned int cpid, unsigned int vector);
|
void (*arm64_issue_ipi)(unsigned int cpid, unsigned int vector);
|
||||||
|
void (*arm64_issue_host_ipi)(unsigned int cpid, unsigned int vector);
|
||||||
void (*handle_arch_irq)(struct pt_regs *);
|
void (*handle_arch_irq)(struct pt_regs *);
|
||||||
|
|
||||||
static void gic_init(void)
|
static void gic_init(void)
|
||||||
@@ -77,14 +79,18 @@ static void gic_init(void)
|
|||||||
gic_cpu_init = gic_cpu_init_gicv3;
|
gic_cpu_init = gic_cpu_init_gicv3;
|
||||||
gic_enable = gic_enable_gicv3;
|
gic_enable = gic_enable_gicv3;
|
||||||
arm64_issue_ipi = arm64_issue_ipi_gicv3;
|
arm64_issue_ipi = arm64_issue_ipi_gicv3;
|
||||||
|
arm64_issue_host_ipi = arm64_issue_host_ipi_gicv3;
|
||||||
handle_arch_irq = handle_interrupt_gicv3;
|
handle_arch_irq = handle_interrupt_gicv3;
|
||||||
|
kprintf("%: GICv3\n", __func__);
|
||||||
} else {
|
} else {
|
||||||
/* Setup functions for GICv2 */
|
/* Setup functions for GICv2 */
|
||||||
gic_dist_init = gic_dist_init_gicv2;
|
gic_dist_init = gic_dist_init_gicv2;
|
||||||
gic_cpu_init = gic_cpu_init_gicv2;
|
gic_cpu_init = gic_cpu_init_gicv2;
|
||||||
gic_enable = gic_enable_gicv2;
|
gic_enable = gic_enable_gicv2;
|
||||||
arm64_issue_ipi = arm64_issue_ipi_gicv2;
|
arm64_issue_ipi = arm64_issue_ipi_gicv2;
|
||||||
|
arm64_issue_host_ipi = arm64_issue_host_ipi_gicv2;
|
||||||
handle_arch_irq = handle_interrupt_gicv2;
|
handle_arch_irq = handle_interrupt_gicv2;
|
||||||
|
kprintf("%: GICv2\n", __func__);
|
||||||
}
|
}
|
||||||
|
|
||||||
gic_dist_init(ihk_param_gic_dist_base_pa, ihk_param_gic_dist_map_size);
|
gic_dist_init(ihk_param_gic_dist_base_pa, ihk_param_gic_dist_map_size);
|
||||||
@@ -114,42 +120,94 @@ static struct ihk_mc_interrupt_handler cpu_stop_handler = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
extern long freeze_thaw(void *nmi_ctx);
|
extern long freeze_thaw(void *nmi_ctx);
|
||||||
static void multi_nm_interrupt_handler(void *priv)
|
static void multi_interrupt_handler(void *priv)
|
||||||
{
|
{
|
||||||
extern int nmi_mode;
|
switch (multi_intr_mode) {
|
||||||
struct pt_regs *regs = (struct pt_regs *)priv;
|
|
||||||
union arm64_cpu_local_variables *clv;
|
|
||||||
|
|
||||||
switch (nmi_mode) {
|
|
||||||
case 1:
|
case 1:
|
||||||
case 2:
|
case 2: /* mode == 1or2, for FREEZER intr */
|
||||||
/* mode == 1or2, for FREEZER NMI */
|
dkprintf("%s: freeze mode intr catch. (multi_intr_mode=%d)\n",
|
||||||
dkprintf("%s: freeze mode NMI catch. (nmi_mode=%d)\n",
|
__func__, multi_intr_mode);
|
||||||
__func__, nmi_mode);
|
|
||||||
freeze_thaw(NULL);
|
freeze_thaw(NULL);
|
||||||
break;
|
break;
|
||||||
|
default:
|
||||||
|
ekprintf("%s: Unknown multi-intr-mode(%d) detected.\n",
|
||||||
|
__func__, multi_intr_mode);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void arch_save_panic_regs(void *irq_regs)
|
||||||
|
{
|
||||||
|
struct pt_regs *regs = (struct pt_regs *)irq_regs;
|
||||||
|
union arm64_cpu_local_variables *clv;
|
||||||
|
|
||||||
|
clv = get_arm64_this_cpu_local();
|
||||||
|
|
||||||
|
/* For user-space, use saved kernel context */
|
||||||
|
if (regs->pc < USER_END) {
|
||||||
|
memset(clv->arm64_cpu_local_thread.panic_regs,
|
||||||
|
0, sizeof(clv->arm64_cpu_local_thread.panic_regs));
|
||||||
|
clv->arm64_cpu_local_thread.panic_regs[29] =
|
||||||
|
current_thread_info()->cpu_context.fp;
|
||||||
|
clv->arm64_cpu_local_thread.panic_regs[31] =
|
||||||
|
current_thread_info()->cpu_context.sp;
|
||||||
|
clv->arm64_cpu_local_thread.panic_regs[32] =
|
||||||
|
current_thread_info()->cpu_context.pc;
|
||||||
|
clv->arm64_cpu_local_thread.panic_regs[33] =
|
||||||
|
PSR_MODE_EL1h;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
memcpy(clv->arm64_cpu_local_thread.panic_regs,
|
||||||
|
regs->regs, sizeof(regs->regs));
|
||||||
|
clv->arm64_cpu_local_thread.panic_regs[31] = regs->sp;
|
||||||
|
clv->arm64_cpu_local_thread.panic_regs[32] = regs->pc;
|
||||||
|
clv->arm64_cpu_local_thread.panic_regs[33] =
|
||||||
|
regs->pstate;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
clv->arm64_cpu_local_thread.paniced = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
void arch_clear_panic(void)
|
||||||
|
{
|
||||||
|
union arm64_cpu_local_variables *clv;
|
||||||
|
|
||||||
|
clv = get_arm64_this_cpu_local();
|
||||||
|
clv->arm64_cpu_local_thread.paniced = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct ihk_mc_interrupt_handler multi_intr_handler = {
|
||||||
|
.func = multi_interrupt_handler,
|
||||||
|
.priv = NULL,
|
||||||
|
};
|
||||||
|
|
||||||
|
static void multi_nm_interrupt_handler(void *irq_regs)
|
||||||
|
{
|
||||||
|
extern int nmi_mode;
|
||||||
|
|
||||||
|
dkprintf("%s: ...\n", __func__);
|
||||||
|
switch (nmi_mode) {
|
||||||
case 0:
|
case 0:
|
||||||
/* mode == 0, for MEMDUMP NMI */
|
/* mode == 0, for MEMDUMP NMI */
|
||||||
clv = get_arm64_this_cpu_local();
|
arch_save_panic_regs(irq_regs);
|
||||||
|
|
||||||
if (regs) {
|
|
||||||
memcpy(clv->arm64_cpu_local_thread.panic_regs,
|
|
||||||
regs->regs, sizeof(regs->regs));
|
|
||||||
clv->arm64_cpu_local_thread.panic_regs[31] = regs->sp;
|
|
||||||
clv->arm64_cpu_local_thread.panic_regs[32] = regs->pc;
|
|
||||||
clv->arm64_cpu_local_thread.panic_regs[33] =
|
|
||||||
regs->pstate;
|
|
||||||
}
|
|
||||||
clv->arm64_cpu_local_thread.paniced = 1;
|
|
||||||
ihk_mc_query_mem_areas();
|
ihk_mc_query_mem_areas();
|
||||||
/* memdump-nmi is halted McKernel, break is unnecessary. */
|
/* memdump-nmi is halted McKernel, break is unnecessary. */
|
||||||
/* fall through */
|
/* fall through */
|
||||||
case 3:
|
case 3:
|
||||||
/* mode == 3, for SHUTDOWN-WAIT NMI */
|
/* mode == 3, for SHUTDOWN-WAIT NMI */
|
||||||
while (1) {
|
kprintf("%s: STOP\n", __func__);
|
||||||
|
while (nmi_mode != 4)
|
||||||
cpu_halt();
|
cpu_halt();
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 4:
|
||||||
|
/* mode == 4, continue NMI */
|
||||||
|
arch_clear_panic();
|
||||||
|
if (!ihk_mc_get_processor_id()) {
|
||||||
|
ihk_mc_clear_dump_page_completion();
|
||||||
}
|
}
|
||||||
|
kprintf("%s: RESUME, nmi_mode: %d\n", __func__, nmi_mode);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
@@ -423,6 +481,8 @@ void ihk_mc_init_ap(void)
|
|||||||
|
|
||||||
ihk_mc_register_interrupt_handler(INTRID_CPU_STOP, &cpu_stop_handler);
|
ihk_mc_register_interrupt_handler(INTRID_CPU_STOP, &cpu_stop_handler);
|
||||||
ihk_mc_register_interrupt_handler(INTRID_MULTI_NMI, &multi_nmi_handler);
|
ihk_mc_register_interrupt_handler(INTRID_MULTI_NMI, &multi_nmi_handler);
|
||||||
|
ihk_mc_register_interrupt_handler(INTRID_MULTI_INTR,
|
||||||
|
&multi_intr_handler);
|
||||||
ihk_mc_register_interrupt_handler(
|
ihk_mc_register_interrupt_handler(
|
||||||
ihk_mc_get_vector(IHK_TLB_FLUSH_IRQ_VECTOR_START),
|
ihk_mc_get_vector(IHK_TLB_FLUSH_IRQ_VECTOR_START),
|
||||||
&remote_tlb_flush_handler);
|
&remote_tlb_flush_handler);
|
||||||
@@ -776,6 +836,21 @@ unsigned long cpu_disable_interrupt_save(void)
|
|||||||
return flags;
|
return flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* save ICC_PMR_EL1 & enable interrupt (ICC_PMR_EL1 <= ICC_PMR_EL1_UNMASKED) */
|
||||||
|
unsigned long cpu_enable_interrupt_save(void)
|
||||||
|
{
|
||||||
|
unsigned long flags;
|
||||||
|
unsigned long masked = ICC_PMR_EL1_UNMASKED;
|
||||||
|
|
||||||
|
asm volatile(
|
||||||
|
"mrs_s %0, " __stringify(ICC_PMR_EL1) "\n"
|
||||||
|
"msr_s " __stringify(ICC_PMR_EL1) ",%1"
|
||||||
|
: "=&r" (flags)
|
||||||
|
: "r" (masked)
|
||||||
|
: "memory");
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
|
|
||||||
#else /* defined(CONFIG_HAS_NMI) */
|
#else /* defined(CONFIG_HAS_NMI) */
|
||||||
|
|
||||||
/* @ref.impl arch/arm64/include/asm/irqflags.h::arch_local_irq_enable */
|
/* @ref.impl arch/arm64/include/asm/irqflags.h::arch_local_irq_enable */
|
||||||
@@ -824,6 +899,20 @@ unsigned long cpu_disable_interrupt_save(void)
|
|||||||
: "memory");
|
: "memory");
|
||||||
return flags;
|
return flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* save PSTATE.DAIF & enable interrupt (PSTATE.DAIF I bit set) */
|
||||||
|
unsigned long cpu_enable_interrupt_save(void)
|
||||||
|
{
|
||||||
|
unsigned long flags;
|
||||||
|
|
||||||
|
asm volatile(
|
||||||
|
"mrs %0, daif // arch_local_irq_save\n"
|
||||||
|
"msr daifclr, #2"
|
||||||
|
: "=r" (flags)
|
||||||
|
:
|
||||||
|
: "memory");
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
#endif /* defined(CONFIG_HAS_NMI) */
|
#endif /* defined(CONFIG_HAS_NMI) */
|
||||||
|
|
||||||
/* we not have "pause" instruction, instead "yield" instruction */
|
/* we not have "pause" instruction, instead "yield" instruction */
|
||||||
@@ -951,7 +1040,7 @@ void ihk_mc_boot_cpu(int cpuid, unsigned long pc)
|
|||||||
setup_cpu_features();
|
setup_cpu_features();
|
||||||
}
|
}
|
||||||
|
|
||||||
init_sve_vl();
|
sve_setup();
|
||||||
}
|
}
|
||||||
|
|
||||||
/* for ihk_mc_init_context() */
|
/* for ihk_mc_init_context() */
|
||||||
@@ -986,6 +1075,9 @@ void ihk_mc_init_context(ihk_mc_kernel_context_t *new_ctx,
|
|||||||
/* branch in ret_from_fork */
|
/* branch in ret_from_fork */
|
||||||
new_ctx->thread->cpu_context.x19 = (unsigned long)next_function;
|
new_ctx->thread->cpu_context.x19 = (unsigned long)next_function;
|
||||||
|
|
||||||
|
sp -= 16;
|
||||||
|
new_ctx->thread->cpu_context.fp = sp;
|
||||||
|
|
||||||
/* set stack_pointer */
|
/* set stack_pointer */
|
||||||
new_ctx->thread->cpu_context.sp = sp - sizeof(ihk_mc_user_context_t);
|
new_ctx->thread->cpu_context.sp = sp - sizeof(ihk_mc_user_context_t);
|
||||||
|
|
||||||
@@ -1001,9 +1093,10 @@ void ihk_mc_init_context(ihk_mc_kernel_context_t *new_ctx,
|
|||||||
const int lcpuid = ihk_mc_get_processor_id();
|
const int lcpuid = ihk_mc_get_processor_id();
|
||||||
const unsigned long syscallno = current_pt_regs()->syscallno;
|
const unsigned long syscallno = current_pt_regs()->syscallno;
|
||||||
#ifdef CONFIG_ARM64_SVE
|
#ifdef CONFIG_ARM64_SVE
|
||||||
const uint16_t orig_sve_vl = current_thread_info()->sve_vl;
|
struct thread_info *ti = current_thread_info();
|
||||||
const uint16_t orig_sve_vl_onexec = current_thread_info()->sve_vl_onexec;
|
const unsigned int orig_sve_vl = ti->sve_vl;
|
||||||
const uint16_t orig_sve_flags = current_thread_info()->sve_flags;
|
const unsigned int orig_sve_vl_onexec = ti->sve_vl_onexec;
|
||||||
|
const unsigned long orig_sve_flags = ti->sve_flags;
|
||||||
#endif /* CONFIG_ARM64_SVE */
|
#endif /* CONFIG_ARM64_SVE */
|
||||||
|
|
||||||
/* get kernel stack address */
|
/* get kernel stack address */
|
||||||
@@ -1023,6 +1116,9 @@ void ihk_mc_init_context(ihk_mc_kernel_context_t *new_ctx,
|
|||||||
|
|
||||||
/* set stack_pointer */
|
/* set stack_pointer */
|
||||||
new_ctx->thread->cpu_context.sp = sp;
|
new_ctx->thread->cpu_context.sp = sp;
|
||||||
|
/* use the 16 bytes padding in ihk_mc_init_user_process()
|
||||||
|
* as closing frame in the frame chain */
|
||||||
|
new_ctx->thread->cpu_context.fp = sp + sizeof(ihk_mc_user_context_t);
|
||||||
|
|
||||||
/* clear pt_regs area */
|
/* clear pt_regs area */
|
||||||
new_uctx = (ihk_mc_user_context_t *)new_ctx->thread->cpu_context.sp;
|
new_uctx = (ihk_mc_user_context_t *)new_ctx->thread->cpu_context.sp;
|
||||||
@@ -1183,7 +1279,7 @@ long ihk_mc_show_cpuinfo(char *buf, size_t buf_size, unsigned long read_off, int
|
|||||||
|
|
||||||
/* generate strings */
|
/* generate strings */
|
||||||
loff += scnprintf(lbuf + loff, lbuf_size - loff,
|
loff += scnprintf(lbuf + loff, lbuf_size - loff,
|
||||||
"processor\t: %d\n", cpuinfo->hwid);
|
"processor\t: %d\n", i);
|
||||||
loff += scnprintf(lbuf + loff, lbuf_size - loff, "Features\t:");
|
loff += scnprintf(lbuf + loff, lbuf_size - loff, "Features\t:");
|
||||||
|
|
||||||
for (j = 0; hwcap_str[j]; j++) {
|
for (j = 0; hwcap_str[j]; j++) {
|
||||||
@@ -1234,7 +1330,6 @@ err:
|
|||||||
}
|
}
|
||||||
|
|
||||||
static int check_and_allocate_fp_regs(struct thread *thread);
|
static int check_and_allocate_fp_regs(struct thread *thread);
|
||||||
void save_fp_regs(struct thread *thread);
|
|
||||||
|
|
||||||
void arch_clone_thread(struct thread *othread, unsigned long pc,
|
void arch_clone_thread(struct thread *othread, unsigned long pc,
|
||||||
unsigned long sp, struct thread *nthread)
|
unsigned long sp, struct thread *nthread)
|
||||||
@@ -1346,11 +1441,15 @@ int ihk_mc_arch_get_special_register(enum ihk_asr_type type,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*@
|
/*@
|
||||||
@ requires \valid_apicid(cpu); // valid APIC ID or not
|
@ requires \valid_cpuid(cpu); // valid CPU logical ID
|
||||||
@ ensures \result == 0
|
@ ensures \result == 0
|
||||||
@*/
|
@*/
|
||||||
int ihk_mc_interrupt_cpu(int cpu, int vector)
|
int ihk_mc_interrupt_cpu(int cpu, int vector)
|
||||||
{
|
{
|
||||||
|
if (cpu < 0 || cpu >= num_processors) {
|
||||||
|
kprintf("%s: invalid CPU id: %d\n", __func__, cpu);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
dkprintf("[%d] ihk_mc_interrupt_cpu: %d\n", ihk_mc_get_processor_id(), cpu);
|
dkprintf("[%d] ihk_mc_interrupt_cpu: %d\n", ihk_mc_get_processor_id(), cpu);
|
||||||
(*arm64_issue_ipi)(cpu, vector);
|
(*arm64_issue_ipi)(cpu, vector);
|
||||||
return 0;
|
return 0;
|
||||||
@@ -1471,8 +1570,7 @@ check_and_allocate_fp_regs(struct thread *thread)
|
|||||||
|
|
||||||
if (!thread->fp_regs) {
|
if (!thread->fp_regs) {
|
||||||
kprintf("error: allocating fp_regs pages\n");
|
kprintf("error: allocating fp_regs pages\n");
|
||||||
result = 1;
|
result = -ENOMEM;
|
||||||
panic("panic: error allocating fp_regs pages");
|
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1481,37 +1579,51 @@ check_and_allocate_fp_regs(struct thread *thread)
|
|||||||
|
|
||||||
#ifdef CONFIG_ARM64_SVE
|
#ifdef CONFIG_ARM64_SVE
|
||||||
if (likely(elf_hwcap & HWCAP_SVE)) {
|
if (likely(elf_hwcap & HWCAP_SVE)) {
|
||||||
sve_alloc(thread);
|
result = sve_alloc(thread);
|
||||||
}
|
}
|
||||||
#endif /* CONFIG_ARM64_SVE */
|
#endif /* CONFIG_ARM64_SVE */
|
||||||
out:
|
out:
|
||||||
|
if (result) {
|
||||||
|
release_fp_regs(thread);
|
||||||
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*@
|
/*@
|
||||||
@ requires \valid(thread);
|
@ requires \valid(thread);
|
||||||
@*/
|
@*/
|
||||||
void
|
int
|
||||||
save_fp_regs(struct thread *thread)
|
save_fp_regs(struct thread *thread)
|
||||||
{
|
{
|
||||||
|
int ret = 0;
|
||||||
if (thread == &cpu_local_var(idle)) {
|
if (thread == &cpu_local_var(idle)) {
|
||||||
return;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (likely(elf_hwcap & (HWCAP_FP | HWCAP_ASIMD))) {
|
if (likely(elf_hwcap & (HWCAP_FP | HWCAP_ASIMD))) {
|
||||||
if (check_and_allocate_fp_regs(thread) != 0) {
|
ret = check_and_allocate_fp_regs(thread);
|
||||||
// alloc error.
|
if (ret) {
|
||||||
return;
|
goto out;
|
||||||
}
|
}
|
||||||
thread_fpsimd_save(thread);
|
thread_fpsimd_save(thread);
|
||||||
}
|
}
|
||||||
|
out:
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
void copy_fp_regs(struct thread *from, struct thread *to)
|
int copy_fp_regs(struct thread *from, struct thread *to)
|
||||||
{
|
{
|
||||||
if ((from->fp_regs != NULL) && (check_and_allocate_fp_regs(to) == 0)) {
|
int ret = 0;
|
||||||
memcpy(to->fp_regs, from->fp_regs, sizeof(fp_regs_struct));
|
|
||||||
|
if (from->fp_regs != NULL) {
|
||||||
|
ret = check_and_allocate_fp_regs(to);
|
||||||
|
if (!ret) {
|
||||||
|
memcpy(to->fp_regs,
|
||||||
|
from->fp_regs,
|
||||||
|
sizeof(fp_regs_struct));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
void clear_fp_regs(void)
|
void clear_fp_regs(void)
|
||||||
@@ -1626,6 +1738,7 @@ static inline int arch_cpu_mrs(uint32_t sys_reg, uint64_t *val)
|
|||||||
SYSREG_READ_S(IMP_PF_INJECTION_DISTANCE5_EL0);
|
SYSREG_READ_S(IMP_PF_INJECTION_DISTANCE5_EL0);
|
||||||
SYSREG_READ_S(IMP_PF_INJECTION_DISTANCE6_EL0);
|
SYSREG_READ_S(IMP_PF_INJECTION_DISTANCE6_EL0);
|
||||||
SYSREG_READ_S(IMP_PF_INJECTION_DISTANCE7_EL0);
|
SYSREG_READ_S(IMP_PF_INJECTION_DISTANCE7_EL0);
|
||||||
|
SYSREG_READ_S(IMP_PF_PMUSERENR_EL0);
|
||||||
SYSREG_READ_S(IMP_BARRIER_CTRL_EL1);
|
SYSREG_READ_S(IMP_BARRIER_CTRL_EL1);
|
||||||
SYSREG_READ_S(IMP_BARRIER_BST_BIT_EL1);
|
SYSREG_READ_S(IMP_BARRIER_BST_BIT_EL1);
|
||||||
SYSREG_READ_S(IMP_BARRIER_INIT_SYNC_BB0_EL1);
|
SYSREG_READ_S(IMP_BARRIER_INIT_SYNC_BB0_EL1);
|
||||||
@@ -1696,6 +1809,7 @@ static inline int arch_cpu_msr(uint32_t sys_reg, uint64_t val)
|
|||||||
SYSREG_WRITE_S(IMP_PF_INJECTION_DISTANCE5_EL0);
|
SYSREG_WRITE_S(IMP_PF_INJECTION_DISTANCE5_EL0);
|
||||||
SYSREG_WRITE_S(IMP_PF_INJECTION_DISTANCE6_EL0);
|
SYSREG_WRITE_S(IMP_PF_INJECTION_DISTANCE6_EL0);
|
||||||
SYSREG_WRITE_S(IMP_PF_INJECTION_DISTANCE7_EL0);
|
SYSREG_WRITE_S(IMP_PF_INJECTION_DISTANCE7_EL0);
|
||||||
|
SYSREG_WRITE_S(IMP_PF_PMUSERENR_EL0);
|
||||||
SYSREG_WRITE_S(IMP_BARRIER_CTRL_EL1);
|
SYSREG_WRITE_S(IMP_BARRIER_CTRL_EL1);
|
||||||
SYSREG_WRITE_S(IMP_BARRIER_BST_BIT_EL1);
|
SYSREG_WRITE_S(IMP_BARRIER_BST_BIT_EL1);
|
||||||
SYSREG_WRITE_S(IMP_BARRIER_INIT_SYNC_BB0_EL1);
|
SYSREG_WRITE_S(IMP_BARRIER_INIT_SYNC_BB0_EL1);
|
||||||
@@ -1762,4 +1876,9 @@ int smp_call_func(cpu_set_t *__cpu_set, smp_func_t __func, void *__arg)
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void arch_flush_icache_all(void)
|
||||||
|
{
|
||||||
|
asm("ic ialluis");
|
||||||
|
dsb(ish);
|
||||||
|
}
|
||||||
/*** end of file ***/
|
/*** end of file ***/
|
||||||
|
|||||||
@@ -970,7 +970,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
|
|||||||
#ifdef CONFIG_ARM64_SVE
|
#ifdef CONFIG_ARM64_SVE
|
||||||
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_SVE_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SVE),
|
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_SVE_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SVE),
|
||||||
#endif
|
#endif
|
||||||
{},
|
{ 0 },
|
||||||
};
|
};
|
||||||
|
|
||||||
/* @ref.impl arch/arm64/kernel/cpufeature.c */
|
/* @ref.impl arch/arm64/kernel/cpufeature.c */
|
||||||
|
|||||||
@@ -10,5 +10,5 @@ struct cpu_info cpu_table[] = {
|
|||||||
.cpu_name = "AArch64 Processor",
|
.cpu_name = "AArch64 Processor",
|
||||||
.cpu_setup = __cpu_setup,
|
.cpu_setup = __cpu_setup,
|
||||||
},
|
},
|
||||||
{ /* Empty */ },
|
{ 0 },
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
#include <cputype.h>
|
#include <cputype.h>
|
||||||
#include <irqflags.h>
|
#include <irqflags.h>
|
||||||
#include <ihk/context.h>
|
#include <ihk/context.h>
|
||||||
#include <ihk/debug.h>
|
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <debug-monitors.h>
|
#include <debug-monitors.h>
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
/* fpsimd.c COPYRIGHT FUJITSU LIMITED 2016-2018 */
|
/* fpsimd.c COPYRIGHT FUJITSU LIMITED 2016-2019 */
|
||||||
#include <thread_info.h>
|
#include <thread_info.h>
|
||||||
#include <fpsimd.h>
|
#include <fpsimd.h>
|
||||||
#include <cpuinfo.h>
|
#include <cpuinfo.h>
|
||||||
@@ -9,8 +9,9 @@
|
|||||||
#include <prctl.h>
|
#include <prctl.h>
|
||||||
#include <cpufeature.h>
|
#include <cpufeature.h>
|
||||||
#include <kmalloc.h>
|
#include <kmalloc.h>
|
||||||
#include <debug.h>
|
#include <ihk/debug.h>
|
||||||
#include <process.h>
|
#include <process.h>
|
||||||
|
#include <bitmap.h>
|
||||||
|
|
||||||
//#define DEBUG_PRINT_FPSIMD
|
//#define DEBUG_PRINT_FPSIMD
|
||||||
|
|
||||||
@@ -21,11 +22,87 @@
|
|||||||
|
|
||||||
#ifdef CONFIG_ARM64_SVE
|
#ifdef CONFIG_ARM64_SVE
|
||||||
|
|
||||||
|
/* Set of available vector lengths, as vq_to_bit(vq): */
|
||||||
|
static DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);
|
||||||
|
|
||||||
/* Maximum supported vector length across all CPUs (initially poisoned) */
|
/* Maximum supported vector length across all CPUs (initially poisoned) */
|
||||||
int sve_max_vl = -1;
|
int sve_max_vl = -1;
|
||||||
|
|
||||||
/* Default VL for tasks that don't set it explicitly: */
|
/* Default VL for tasks that don't set it explicitly: */
|
||||||
int sve_default_vl = -1;
|
int sve_default_vl = -1;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Helpers to translate bit indices in sve_vq_map to VQ values (and
|
||||||
|
* vice versa). This allows find_next_bit() to be used to find the
|
||||||
|
* _maximum_ VQ not exceeding a certain value.
|
||||||
|
*/
|
||||||
|
|
||||||
|
static unsigned int vq_to_bit(unsigned int vq)
|
||||||
|
{
|
||||||
|
return SVE_VQ_MAX - vq;
|
||||||
|
}
|
||||||
|
|
||||||
|
static unsigned int bit_to_vq(unsigned int bit)
|
||||||
|
{
|
||||||
|
if (bit >= SVE_VQ_MAX) {
|
||||||
|
bit = SVE_VQ_MAX - 1;
|
||||||
|
}
|
||||||
|
return SVE_VQ_MAX - bit;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* All vector length selection from userspace comes through here.
|
||||||
|
* We're on a slow path, so some sanity-checks are included.
|
||||||
|
* If things go wrong there's a bug somewhere, but try to fall back to a
|
||||||
|
* safe choice.
|
||||||
|
*/
|
||||||
|
static unsigned int find_supported_vector_length(unsigned int vl)
|
||||||
|
{
|
||||||
|
int bit;
|
||||||
|
int max_vl = sve_max_vl;
|
||||||
|
|
||||||
|
if (!sve_vl_valid(vl)) {
|
||||||
|
vl = SVE_VL_MIN;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!sve_vl_valid(max_vl)) {
|
||||||
|
max_vl = SVE_VL_MIN;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vl > max_vl) {
|
||||||
|
vl = max_vl;
|
||||||
|
}
|
||||||
|
|
||||||
|
bit = find_next_bit(sve_vq_map, SVE_VQ_MAX,
|
||||||
|
vq_to_bit(sve_vq_from_vl(vl)));
|
||||||
|
return sve_vl_from_vq(bit_to_vq(bit));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX))
|
||||||
|
{
|
||||||
|
unsigned int vq, vl;
|
||||||
|
unsigned long zcr;
|
||||||
|
|
||||||
|
bitmap_zero(map, SVE_VQ_MAX);
|
||||||
|
|
||||||
|
zcr = ZCR_EL1_LEN_MASK;
|
||||||
|
zcr = read_sysreg_s(SYS_ZCR_EL1) & ~zcr;
|
||||||
|
|
||||||
|
for (vq = SVE_VQ_MAX; vq >= SVE_VQ_MIN; --vq) {
|
||||||
|
/* self-syncing */
|
||||||
|
write_sysreg_s(zcr | (vq - 1), SYS_ZCR_EL1);
|
||||||
|
vl = sve_get_vl();
|
||||||
|
/* skip intervening lengths */
|
||||||
|
vq = sve_vq_from_vl(vl);
|
||||||
|
set_bit(vq_to_bit(vq), map);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void sve_init_vq_map(void)
|
||||||
|
{
|
||||||
|
sve_probe_vqs(sve_vq_map);
|
||||||
|
}
|
||||||
|
|
||||||
size_t sve_state_size(struct thread const *thread)
|
size_t sve_state_size(struct thread const *thread)
|
||||||
{
|
{
|
||||||
unsigned int vl = thread->ctx.thread->sve_vl;
|
unsigned int vl = thread->ctx.thread->sve_vl;
|
||||||
@@ -42,17 +119,19 @@ void sve_free(struct thread *thread)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void sve_alloc(struct thread *thread)
|
int sve_alloc(struct thread *thread)
|
||||||
{
|
{
|
||||||
if (thread->ctx.thread->sve_state) {
|
if (thread->ctx.thread->sve_state) {
|
||||||
return;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
thread->ctx.thread->sve_state =
|
thread->ctx.thread->sve_state =
|
||||||
kmalloc(sve_state_size(thread), IHK_MC_AP_NOWAIT);
|
kmalloc(sve_state_size(thread), IHK_MC_AP_NOWAIT);
|
||||||
BUG_ON(!thread->ctx.thread->sve_state);
|
if (thread->ctx.thread->sve_state == NULL) {
|
||||||
|
return -ENOMEM;
|
||||||
|
}
|
||||||
memset(thread->ctx.thread->sve_state, 0, sve_state_size(thread));
|
memset(thread->ctx.thread->sve_state, 0, sve_state_size(thread));
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int get_nr_threads(struct process *proc)
|
static int get_nr_threads(struct process *proc)
|
||||||
@@ -75,19 +154,7 @@ int sve_set_vector_length(struct thread *thread,
|
|||||||
{
|
{
|
||||||
struct thread_info *ti = thread->ctx.thread;
|
struct thread_info *ti = thread->ctx.thread;
|
||||||
|
|
||||||
BUG_ON(thread == cpu_local_var(current) && cpu_local_var(no_preempt) == 0);
|
if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT |
|
||||||
|
|
||||||
/*
|
|
||||||
* To avoid accidents, forbid setting for individual threads of a
|
|
||||||
* multithreaded process. User code that knows what it's doing can
|
|
||||||
* pass PR_SVE_SET_VL_THREAD to override this restriction:
|
|
||||||
*/
|
|
||||||
if (!(flags & PR_SVE_SET_VL_THREAD) && get_nr_threads(thread->proc) != 1) {
|
|
||||||
return -EINVAL;
|
|
||||||
}
|
|
||||||
flags &= ~(unsigned long)PR_SVE_SET_VL_THREAD;
|
|
||||||
|
|
||||||
if (flags & ~(unsigned long)(PR_SVE_SET_VL_INHERIT |
|
|
||||||
PR_SVE_SET_VL_ONEXEC)) {
|
PR_SVE_SET_VL_ONEXEC)) {
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
@@ -96,13 +163,19 @@ int sve_set_vector_length(struct thread *thread,
|
|||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (vl > sve_max_vl) {
|
/*
|
||||||
BUG_ON(!sve_vl_valid(sve_max_vl));
|
* Clamp to the maximum vector length that VL-agnostic SVE code can
|
||||||
vl = sve_max_vl;
|
* work with. A flag may be assigned in the future to allow setting
|
||||||
|
* of larger vector lengths without confusing older software.
|
||||||
|
*/
|
||||||
|
if (vl > SVE_VL_ARCH_MAX) {
|
||||||
|
vl = SVE_VL_ARCH_MAX;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (flags & (PR_SVE_SET_VL_ONEXEC |
|
vl = find_supported_vector_length(vl);
|
||||||
PR_SVE_SET_VL_INHERIT)) {
|
|
||||||
|
if (flags & (PR_SVE_VL_INHERIT |
|
||||||
|
PR_SVE_SET_VL_ONEXEC)) {
|
||||||
ti->sve_vl_onexec = vl;
|
ti->sve_vl_onexec = vl;
|
||||||
} else {
|
} else {
|
||||||
/* Reset VL to system default on next exec: */
|
/* Reset VL to system default on next exec: */
|
||||||
@@ -114,39 +187,42 @@ int sve_set_vector_length(struct thread *thread,
|
|||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (vl != ti->sve_vl) {
|
if (vl == ti->sve_vl) {
|
||||||
if ((elf_hwcap & HWCAP_SVE)) {
|
goto out;
|
||||||
fp_regs_struct fp_regs;
|
}
|
||||||
memset(&fp_regs, 0, sizeof(fp_regs));
|
|
||||||
|
|
||||||
/* for self at prctl syscall */
|
if ((elf_hwcap & HWCAP_SVE)) {
|
||||||
if (thread == cpu_local_var(current)) {
|
fp_regs_struct fp_regs;
|
||||||
save_fp_regs(thread);
|
|
||||||
clear_fp_regs();
|
|
||||||
thread_sve_to_fpsimd(thread, &fp_regs);
|
|
||||||
sve_free(thread);
|
|
||||||
|
|
||||||
ti->sve_vl = vl;
|
memset(&fp_regs, 0, sizeof(fp_regs));
|
||||||
|
|
||||||
sve_alloc(thread);
|
/* for self at prctl syscall */
|
||||||
thread_fpsimd_to_sve(thread, &fp_regs);
|
if (thread == cpu_local_var(current)) {
|
||||||
restore_fp_regs(thread);
|
save_fp_regs(thread);
|
||||||
/* for target thread at ptrace */
|
clear_fp_regs();
|
||||||
} else {
|
thread_sve_to_fpsimd(thread, &fp_regs);
|
||||||
thread_sve_to_fpsimd(thread, &fp_regs);
|
sve_free(thread);
|
||||||
sve_free(thread);
|
|
||||||
|
|
||||||
ti->sve_vl = vl;
|
ti->sve_vl = vl;
|
||||||
|
|
||||||
sve_alloc(thread);
|
sve_alloc(thread);
|
||||||
thread_fpsimd_to_sve(thread, &fp_regs);
|
thread_fpsimd_to_sve(thread, &fp_regs);
|
||||||
}
|
restore_fp_regs(thread);
|
||||||
|
/* for target thread at ptrace */
|
||||||
|
} else {
|
||||||
|
thread_sve_to_fpsimd(thread, &fp_regs);
|
||||||
|
sve_free(thread);
|
||||||
|
|
||||||
|
ti->sve_vl = vl;
|
||||||
|
|
||||||
|
sve_alloc(thread);
|
||||||
|
thread_fpsimd_to_sve(thread, &fp_regs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ti->sve_vl = vl;
|
ti->sve_vl = vl;
|
||||||
|
|
||||||
out:
|
out:
|
||||||
ti->sve_flags = flags & PR_SVE_SET_VL_INHERIT;
|
ti->sve_flags = flags & PR_SVE_VL_INHERIT;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -156,44 +232,53 @@ out:
|
|||||||
* Encode the current vector length and flags for return.
|
* Encode the current vector length and flags for return.
|
||||||
* This is only required for prctl(): ptrace has separate fields
|
* This is only required for prctl(): ptrace has separate fields
|
||||||
*/
|
*/
|
||||||
static int sve_prctl_status(const struct thread_info *ti)
|
static int sve_prctl_status(unsigned long flags)
|
||||||
{
|
{
|
||||||
int ret = ti->sve_vl;
|
int ret;
|
||||||
|
struct thread_info *ti = cpu_local_var(current)->ctx.thread;
|
||||||
|
|
||||||
ret |= ti->sve_flags << 16;
|
if (flags & PR_SVE_SET_VL_ONEXEC) {
|
||||||
|
ret = ti->sve_vl_onexec;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
ret = ti->sve_vl;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ti->sve_flags & PR_SVE_VL_INHERIT) {
|
||||||
|
ret |= PR_SVE_VL_INHERIT;
|
||||||
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* @ref.impl arch/arm64/kernel/fpsimd.c::sve_set_task_vl */
|
/* @ref.impl arch/arm64/kernel/fpsimd.c::sve_set_task_vl */
|
||||||
int sve_set_thread_vl(struct thread *thread, const unsigned long vector_length,
|
int sve_set_thread_vl(unsigned long arg)
|
||||||
const unsigned long flags)
|
|
||||||
{
|
{
|
||||||
|
unsigned long vl, flags;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
if (!(elf_hwcap & HWCAP_SVE)) {
|
vl = arg & PR_SVE_VL_LEN_MASK;
|
||||||
|
flags = arg & ~vl;
|
||||||
|
|
||||||
|
/* Instead of system_supports_sve() */
|
||||||
|
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
BUG_ON(thread != cpu_local_var(current));
|
ret = sve_set_vector_length(cpu_local_var(current), vl, flags);
|
||||||
|
|
||||||
preempt_disable();
|
|
||||||
ret = sve_set_vector_length(thread, vector_length, flags);
|
|
||||||
preempt_enable();
|
|
||||||
|
|
||||||
if (ret) {
|
if (ret) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
return sve_prctl_status(thread->ctx.thread);
|
return sve_prctl_status(flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* @ref.impl arch/arm64/kernel/fpsimd.c::sve_get_ti_vl */
|
/* @ref.impl arch/arm64/kernel/fpsimd.c::sve_get_ti_vl */
|
||||||
int sve_get_thread_vl(const struct thread *thread)
|
int sve_get_thread_vl(void)
|
||||||
{
|
{
|
||||||
if (!(elf_hwcap & HWCAP_SVE)) {
|
/* Instead of system_supports_sve() */
|
||||||
|
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
return sve_prctl_status(thread->ctx.thread);
|
return sve_prctl_status(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
void do_sve_acc(unsigned int esr, struct pt_regs *regs)
|
void do_sve_acc(unsigned int esr, struct pt_regs *regs)
|
||||||
@@ -203,25 +288,48 @@ void do_sve_acc(unsigned int esr, struct pt_regs *regs)
|
|||||||
panic("");
|
panic("");
|
||||||
}
|
}
|
||||||
|
|
||||||
void init_sve_vl(void)
|
void sve_setup(void)
|
||||||
{
|
{
|
||||||
extern unsigned long ihk_param_default_vl;
|
extern unsigned long ihk_param_default_vl;
|
||||||
uint64_t zcr;
|
uint64_t zcr;
|
||||||
|
|
||||||
|
/* Instead of system_supports_sve() */
|
||||||
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
|
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
zcr = read_system_reg(SYS_ZCR_EL1);
|
/* init sve_vq_map bitmap */
|
||||||
BUG_ON(((zcr & ZCR_EL1_LEN_MASK) + 1) * 16 > sve_max_vl);
|
sve_init_vq_map();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The SVE architecture mandates support for 128-bit vectors,
|
||||||
|
* so sve_vq_map must have at least SVE_VQ_MIN set.
|
||||||
|
* If something went wrong, at least try to patch it up:
|
||||||
|
*/
|
||||||
|
if (!test_bit(vq_to_bit(SVE_VQ_MIN), sve_vq_map)) {
|
||||||
|
set_bit(vq_to_bit(SVE_VQ_MIN), sve_vq_map);
|
||||||
|
}
|
||||||
|
|
||||||
|
zcr = read_system_reg(SYS_ZCR_EL1);
|
||||||
|
sve_max_vl = sve_vl_from_vq((zcr & ZCR_EL1_LEN_MASK) + 1);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Sanity-check that the max VL we determined through CPU features
|
||||||
|
* corresponds properly to sve_vq_map. If not, do our best:
|
||||||
|
*/
|
||||||
|
if (sve_max_vl != find_supported_vector_length(sve_max_vl)) {
|
||||||
|
sve_max_vl = find_supported_vector_length(sve_max_vl);
|
||||||
|
}
|
||||||
|
|
||||||
sve_max_vl = ((zcr & ZCR_EL1_LEN_MASK) + 1) * 16;
|
|
||||||
sve_default_vl = ihk_param_default_vl;
|
sve_default_vl = ihk_param_default_vl;
|
||||||
|
|
||||||
if (sve_default_vl == 0) {
|
if (ihk_param_default_vl !=
|
||||||
kprintf("SVE: Getting default VL = 0 from HOST-Linux.\n");
|
find_supported_vector_length(ihk_param_default_vl)) {
|
||||||
sve_default_vl = sve_max_vl > 64 ? 64 : sve_max_vl;
|
kprintf("SVE: Getting unsupported default VL = %d "
|
||||||
kprintf("SVE: Using default vl(%d byte).\n", sve_default_vl);
|
"from HOST-Linux.\n", sve_default_vl);
|
||||||
|
sve_default_vl = find_supported_vector_length(64);
|
||||||
|
kprintf("SVE: Using default vl(%d byte).\n",
|
||||||
|
sve_default_vl);
|
||||||
}
|
}
|
||||||
|
|
||||||
kprintf("SVE: maximum available vector length %u bytes per vector\n",
|
kprintf("SVE: maximum available vector length %u bytes per vector\n",
|
||||||
@@ -232,7 +340,7 @@ void init_sve_vl(void)
|
|||||||
|
|
||||||
#else /* CONFIG_ARM64_SVE */
|
#else /* CONFIG_ARM64_SVE */
|
||||||
|
|
||||||
void init_sve_vl(void)
|
void sve_setup(void)
|
||||||
{
|
{
|
||||||
/* nothing to do. */
|
/* nothing to do. */
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,7 +10,7 @@
|
|||||||
#include <smp.h>
|
#include <smp.h>
|
||||||
#include <arm-gic-v3.h>
|
#include <arm-gic-v3.h>
|
||||||
|
|
||||||
#define KERNEL_RAM_VADDR MAP_KERNEL_START
|
/* KERNEL_RAM_VADDR is defined by cmake */
|
||||||
|
|
||||||
//#ifndef CONFIG_SMP
|
//#ifndef CONFIG_SMP
|
||||||
//# define PTE_FLAGS PTE_TYPE_PAGE | PTE_AF
|
//# define PTE_FLAGS PTE_TYPE_PAGE | PTE_AF
|
||||||
|
|||||||
@@ -255,90 +255,6 @@ static void __ihk_mc_spinlock_unlock(ihk_spinlock_t *lock, unsigned long flags)
|
|||||||
cpu_restore_interrupt(flags);
|
cpu_restore_interrupt(flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* An implementation of the Mellor-Crummey Scott (MCS) lock */
|
|
||||||
typedef struct mcs_lock_node {
|
|
||||||
unsigned long locked;
|
|
||||||
struct mcs_lock_node *next;
|
|
||||||
unsigned long irqsave;
|
|
||||||
#ifndef ENABLE_UBSAN
|
|
||||||
} __aligned(64) mcs_lock_node_t;
|
|
||||||
#else
|
|
||||||
} mcs_lock_node_t;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef mcs_lock_node_t mcs_lock_t;
|
|
||||||
|
|
||||||
static void mcs_lock_init(struct mcs_lock_node *node)
|
|
||||||
{
|
|
||||||
node->locked = 0;
|
|
||||||
node->next = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void __mcs_lock_lock(struct mcs_lock_node *lock,
|
|
||||||
struct mcs_lock_node *node)
|
|
||||||
{
|
|
||||||
struct mcs_lock_node *pred;
|
|
||||||
|
|
||||||
node->next = NULL;
|
|
||||||
node->locked = 0;
|
|
||||||
pred = xchg8(&(lock->next), node);
|
|
||||||
|
|
||||||
if (pred) {
|
|
||||||
node->locked = 1;
|
|
||||||
pred->next = node;
|
|
||||||
while (node->locked != 0) {
|
|
||||||
cpu_pause();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void __mcs_lock_unlock(struct mcs_lock_node *lock,
|
|
||||||
struct mcs_lock_node *node)
|
|
||||||
{
|
|
||||||
if (node->next == NULL) {
|
|
||||||
struct mcs_lock_node *old = atomic_cmpxchg8(&(lock->next), node, 0);
|
|
||||||
|
|
||||||
if (old == node) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
while (node->next == NULL) {
|
|
||||||
cpu_pause();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
node->next->locked = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mcs_lock_lock_noirq(struct mcs_lock_node *lock,
|
|
||||||
struct mcs_lock_node *node)
|
|
||||||
{
|
|
||||||
preempt_disable();
|
|
||||||
__mcs_lock_lock(lock, node);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mcs_lock_unlock_noirq(struct mcs_lock_node *lock,
|
|
||||||
struct mcs_lock_node *node)
|
|
||||||
{
|
|
||||||
__mcs_lock_unlock(lock, node);
|
|
||||||
preempt_enable();
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mcs_lock_lock(struct mcs_lock_node *lock,
|
|
||||||
struct mcs_lock_node *node)
|
|
||||||
{
|
|
||||||
node->irqsave = cpu_disable_interrupt_save();
|
|
||||||
mcs_lock_lock_noirq(lock, node);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mcs_lock_unlock(struct mcs_lock_node *lock,
|
|
||||||
struct mcs_lock_node *node)
|
|
||||||
{
|
|
||||||
mcs_lock_unlock_noirq(lock, node);
|
|
||||||
cpu_restore_interrupt(node->irqsave);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#define SPINLOCK_IN_MCS_RWLOCK
|
#define SPINLOCK_IN_MCS_RWLOCK
|
||||||
|
|
||||||
// reader/writer lock
|
// reader/writer lock
|
||||||
@@ -743,5 +659,102 @@ static inline int irqflags_can_interrupt(unsigned long flags)
|
|||||||
}
|
}
|
||||||
#endif /* CONFIG_HAS_NMI */
|
#endif /* CONFIG_HAS_NMI */
|
||||||
|
|
||||||
|
struct ihk_rwlock {
|
||||||
|
unsigned int lock;
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline void ihk_mc_rwlock_init(struct ihk_rwlock *rw)
|
||||||
|
{
|
||||||
|
rw->lock = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void ihk_mc_read_lock(struct ihk_rwlock *rw)
|
||||||
|
{
|
||||||
|
unsigned int tmp, tmp2;
|
||||||
|
|
||||||
|
asm volatile(
|
||||||
|
" sevl\n"
|
||||||
|
"1: wfe\n"
|
||||||
|
"2: ldaxr %w0, %2\n"
|
||||||
|
" add %w0, %w0, #1\n"
|
||||||
|
" tbnz %w0, #31, 1b\n"
|
||||||
|
" stxr %w1, %w0, %2\n"
|
||||||
|
" cbnz %w1, 2b\n"
|
||||||
|
: "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock)
|
||||||
|
:
|
||||||
|
: "cc", "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int ihk_mc_read_trylock(struct ihk_rwlock *rw)
|
||||||
|
{
|
||||||
|
unsigned int tmp, tmp2 = 1;
|
||||||
|
|
||||||
|
asm volatile(
|
||||||
|
" ldaxr %w0, %2\n"
|
||||||
|
" add %w0, %w0, #1\n"
|
||||||
|
" tbnz %w0, #31, 1f\n"
|
||||||
|
" stxr %w1, %w0, %2\n"
|
||||||
|
"1:\n"
|
||||||
|
: "=&r" (tmp), "+r" (tmp2), "+Q" (rw->lock)
|
||||||
|
:
|
||||||
|
: "cc", "memory");
|
||||||
|
|
||||||
|
return !tmp2;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void ihk_mc_read_unlock(struct ihk_rwlock *rw)
|
||||||
|
{
|
||||||
|
unsigned int tmp, tmp2;
|
||||||
|
|
||||||
|
asm volatile(
|
||||||
|
"1: ldxr %w0, %2\n"
|
||||||
|
" sub %w0, %w0, #1\n"
|
||||||
|
" stlxr %w1, %w0, %2\n"
|
||||||
|
" cbnz %w1, 1b\n"
|
||||||
|
: "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock)
|
||||||
|
:
|
||||||
|
: "cc", "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void ihk_mc_write_lock(struct ihk_rwlock *rw)
|
||||||
|
{
|
||||||
|
unsigned int tmp;
|
||||||
|
|
||||||
|
asm volatile(
|
||||||
|
" sevl\n"
|
||||||
|
"1: wfe\n"
|
||||||
|
"2: ldaxr %w0, %1\n"
|
||||||
|
" cbnz %w0, 1b\n"
|
||||||
|
" stxr %w0, %w2, %1\n"
|
||||||
|
" cbnz %w0, 2b\n"
|
||||||
|
: "=&r" (tmp), "+Q" (rw->lock)
|
||||||
|
: "r" (0x80000000)
|
||||||
|
: "cc", "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int ihk_mc_write_trylock(struct ihk_rwlock *rw)
|
||||||
|
{
|
||||||
|
unsigned int tmp;
|
||||||
|
|
||||||
|
asm volatile(
|
||||||
|
" ldaxr %w0, %1\n"
|
||||||
|
" cbnz %w0, 1f\n"
|
||||||
|
" stxr %w0, %w2, %1\n"
|
||||||
|
"1:\n"
|
||||||
|
: "=&r" (tmp), "+Q" (rw->lock)
|
||||||
|
: "r" (0x80000000)
|
||||||
|
: "cc", "memory");
|
||||||
|
|
||||||
|
return !tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void ihk_mc_write_unlock(struct ihk_rwlock *rw)
|
||||||
|
{
|
||||||
|
asm volatile(
|
||||||
|
" stlr %w1, %0\n"
|
||||||
|
: "=Q" (rw->lock) : "r" (0) : "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
#define ihk_mc_read_can_lock(rw) ((rw)->lock < 0x80000000)
|
||||||
|
#define ihk_mc_write_can_lock(rw) ((rw)->lock == 0)
|
||||||
#endif /* !__HEADER_ARM64_COMMON_ARCH_LOCK_H */
|
#endif /* !__HEADER_ARM64_COMMON_ARCH_LOCK_H */
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ void panic(const char *);
|
|||||||
*/
|
*/
|
||||||
/* early alloc area address */
|
/* early alloc area address */
|
||||||
/* START:_end, SIZE:512 pages */
|
/* START:_end, SIZE:512 pages */
|
||||||
#define MAP_EARLY_ALLOC_SHIFT 9
|
#define MAP_EARLY_ALLOC_SHIFT 5
|
||||||
#define MAP_EARLY_ALLOC_SIZE (UL(1) << (PAGE_SHIFT + MAP_EARLY_ALLOC_SHIFT))
|
#define MAP_EARLY_ALLOC_SIZE (UL(1) << (PAGE_SHIFT + MAP_EARLY_ALLOC_SHIFT))
|
||||||
|
|
||||||
#ifndef __ASSEMBLY__
|
#ifndef __ASSEMBLY__
|
||||||
@@ -55,7 +55,11 @@ extern char _end[];
|
|||||||
# define MAP_BOOT_PARAM_END (MAP_BOOT_PARAM + MAP_BOOT_PARAM_SIZE)
|
# define MAP_BOOT_PARAM_END (MAP_BOOT_PARAM + MAP_BOOT_PARAM_SIZE)
|
||||||
#endif /* !__ASSEMBLY__ */
|
#endif /* !__ASSEMBLY__ */
|
||||||
|
|
||||||
#if (VA_BITS == 39 && GRANULE_SIZE == _SZ4KB)
|
/*
|
||||||
|
* MAP_KERNEL_START is HOST MODULES_END - 8MiB.
|
||||||
|
* It's defined by cmake.
|
||||||
|
*/
|
||||||
|
#if (VA_BITS == 39 && GRANULE_SIZE == _SZ4KB) /* ARM64_MEMORY_LAYOUT=1 */
|
||||||
#
|
#
|
||||||
# define LD_TASK_UNMAPPED_BASE UL(0x0000000400000000)
|
# define LD_TASK_UNMAPPED_BASE UL(0x0000000400000000)
|
||||||
# define TASK_UNMAPPED_BASE UL(0x0000000800000000)
|
# define TASK_UNMAPPED_BASE UL(0x0000000800000000)
|
||||||
@@ -64,9 +68,8 @@ extern char _end[];
|
|||||||
# define MAP_VMAP_SIZE UL(0x0000000100000000)
|
# define MAP_VMAP_SIZE UL(0x0000000100000000)
|
||||||
# define MAP_FIXED_START UL(0xffffffbffbdfd000)
|
# define MAP_FIXED_START UL(0xffffffbffbdfd000)
|
||||||
# define MAP_ST_START UL(0xffffffc000000000)
|
# define MAP_ST_START UL(0xffffffc000000000)
|
||||||
# define MAP_KERNEL_START UL(0xffffffffff800000)
|
|
||||||
#
|
#
|
||||||
#elif (VA_BITS == 42 && GRANULE_SIZE == _SZ64KB)
|
#elif (VA_BITS == 42 && GRANULE_SIZE == _SZ64KB) /* ARM64_MEMORY_LAYOUT=3 */
|
||||||
#
|
#
|
||||||
# define LD_TASK_UNMAPPED_BASE UL(0x0000002000000000)
|
# define LD_TASK_UNMAPPED_BASE UL(0x0000002000000000)
|
||||||
# define TASK_UNMAPPED_BASE UL(0x0000004000000000)
|
# define TASK_UNMAPPED_BASE UL(0x0000004000000000)
|
||||||
@@ -75,9 +78,8 @@ extern char _end[];
|
|||||||
# define MAP_VMAP_SIZE UL(0x0000000100000000)
|
# define MAP_VMAP_SIZE UL(0x0000000100000000)
|
||||||
# define MAP_FIXED_START UL(0xfffffdfffbdd0000)
|
# define MAP_FIXED_START UL(0xfffffdfffbdd0000)
|
||||||
# define MAP_ST_START UL(0xfffffe0000000000)
|
# define MAP_ST_START UL(0xfffffe0000000000)
|
||||||
# define MAP_KERNEL_START UL(0xffffffffe0000000)
|
|
||||||
#
|
#
|
||||||
#elif (VA_BITS == 48 && GRANULE_SIZE == _SZ4KB)
|
#elif (VA_BITS == 48 && GRANULE_SIZE == _SZ4KB) /* ARM64_MEMORY_LAYOUT=2 */
|
||||||
#
|
#
|
||||||
# define LD_TASK_UNMAPPED_BASE UL(0x0000080000000000)
|
# define LD_TASK_UNMAPPED_BASE UL(0x0000080000000000)
|
||||||
# define TASK_UNMAPPED_BASE UL(0x0000100000000000)
|
# define TASK_UNMAPPED_BASE UL(0x0000100000000000)
|
||||||
@@ -86,9 +88,8 @@ extern char _end[];
|
|||||||
# define MAP_VMAP_SIZE UL(0x0000000100000000)
|
# define MAP_VMAP_SIZE UL(0x0000000100000000)
|
||||||
# define MAP_FIXED_START UL(0xffff7ffffbdfd000)
|
# define MAP_FIXED_START UL(0xffff7ffffbdfd000)
|
||||||
# define MAP_ST_START UL(0xffff800000000000)
|
# define MAP_ST_START UL(0xffff800000000000)
|
||||||
# define MAP_KERNEL_START UL(0xffffffffff800000)
|
|
||||||
#
|
#
|
||||||
#elif (VA_BITS == 48 && GRANULE_SIZE == _SZ64KB)
|
#elif (VA_BITS == 48 && GRANULE_SIZE == _SZ64KB) /* ARM64_MEMORY_LAYOUT=4 */
|
||||||
#
|
#
|
||||||
# define LD_TASK_UNMAPPED_BASE UL(0x0000080000000000)
|
# define LD_TASK_UNMAPPED_BASE UL(0x0000080000000000)
|
||||||
# define TASK_UNMAPPED_BASE UL(0x0000100000000000)
|
# define TASK_UNMAPPED_BASE UL(0x0000100000000000)
|
||||||
@@ -97,7 +98,6 @@ extern char _end[];
|
|||||||
# define MAP_VMAP_SIZE UL(0x0000000100000000)
|
# define MAP_VMAP_SIZE UL(0x0000000100000000)
|
||||||
# define MAP_FIXED_START UL(0xffff7ffffbdd0000)
|
# define MAP_FIXED_START UL(0xffff7ffffbdd0000)
|
||||||
# define MAP_ST_START UL(0xffff800000000000)
|
# define MAP_ST_START UL(0xffff800000000000)
|
||||||
# define MAP_KERNEL_START UL(0xffffffffe0000000)
|
|
||||||
#
|
#
|
||||||
#else
|
#else
|
||||||
# error address space is not defined.
|
# error address space is not defined.
|
||||||
@@ -583,6 +583,40 @@ static inline int pgsize_to_tbllv(size_t pgsize)
|
|||||||
return level;
|
return level;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline int pgsize_to_pgshift(size_t pgsize)
|
||||||
|
{
|
||||||
|
/* We need to use if instead of switch because
|
||||||
|
* sometimes PTLX_CONT_SIZE == PTLX_SIZE
|
||||||
|
*/
|
||||||
|
if (pgsize == PTL4_CONT_SIZE) {
|
||||||
|
if (CONFIG_ARM64_PGTABLE_LEVELS > 3) {
|
||||||
|
return PTL4_CONT_SHIFT;
|
||||||
|
}
|
||||||
|
} else if (pgsize == PTL4_SIZE) {
|
||||||
|
if (CONFIG_ARM64_PGTABLE_LEVELS > 3) {
|
||||||
|
return PTL4_SHIFT;
|
||||||
|
}
|
||||||
|
} else if (pgsize == PTL3_CONT_SIZE) {
|
||||||
|
if (CONFIG_ARM64_PGTABLE_LEVELS > 2) {
|
||||||
|
return PTL3_CONT_SHIFT;
|
||||||
|
}
|
||||||
|
} else if (pgsize == PTL3_SIZE) {
|
||||||
|
if (CONFIG_ARM64_PGTABLE_LEVELS > 2) {
|
||||||
|
return PTL3_SHIFT;
|
||||||
|
}
|
||||||
|
} else if (pgsize == PTL2_CONT_SIZE) {
|
||||||
|
return PTL2_CONT_SHIFT;
|
||||||
|
} else if (pgsize == PTL2_SIZE) {
|
||||||
|
return PTL2_SHIFT;
|
||||||
|
} else if (pgsize == PTL1_CONT_SIZE) {
|
||||||
|
return PTL1_CONT_SHIFT;
|
||||||
|
} else if (pgsize == PTL1_SIZE) {
|
||||||
|
return PTL1_SHIFT;
|
||||||
|
}
|
||||||
|
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
|
||||||
static inline size_t tbllv_to_pgsize(int level)
|
static inline size_t tbllv_to_pgsize(int level)
|
||||||
{
|
{
|
||||||
size_t pgsize = 0;
|
size_t pgsize = 0;
|
||||||
|
|||||||
@@ -20,17 +20,21 @@ struct arm_pmu {
|
|||||||
void (*reset)(void*);
|
void (*reset)(void*);
|
||||||
int (*enable_pmu)(void);
|
int (*enable_pmu)(void);
|
||||||
void (*disable_pmu)(void);
|
void (*disable_pmu)(void);
|
||||||
int (*enable_counter)(int);
|
int (*enable_counter)(unsigned long counter_mask);
|
||||||
int (*disable_counter)(int);
|
int (*disable_counter)(unsigned long counter_mask);
|
||||||
int (*enable_intens)(int);
|
int (*enable_intens)(unsigned long counter_mask);
|
||||||
int (*disable_intens)(int);
|
int (*disable_intens)(unsigned long counter_mask);
|
||||||
int (*set_event_filter)(unsigned long*, int);
|
int (*set_event_filter)(unsigned long*, int);
|
||||||
void (*write_evtype)(int, uint32_t);
|
void (*write_evtype)(int, uint32_t);
|
||||||
int (*get_event_idx)(int num_events, unsigned long used_mask,
|
int (*get_event_idx)(int num_events, unsigned long used_mask,
|
||||||
unsigned long config);
|
unsigned long config);
|
||||||
int (*map_event)(uint32_t, uint64_t);
|
int (*map_event)(uint32_t, uint64_t);
|
||||||
|
int (*map_hw_event)(uint64_t config);
|
||||||
|
int (*map_cache_event)(uint64_t config);
|
||||||
|
int (*map_raw_event)(uint64_t config);
|
||||||
void (*enable_user_access_pmu_regs)(void);
|
void (*enable_user_access_pmu_regs)(void);
|
||||||
void (*disable_user_access_pmu_regs)(void);
|
void (*disable_user_access_pmu_regs)(void);
|
||||||
|
int (*counter_mask_valid)(unsigned long counter_mask);
|
||||||
struct per_cpu_arm_pmu *per_cpu;
|
struct per_cpu_arm_pmu *per_cpu;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -102,4 +102,6 @@ static inline void cpu_disable_nmi(void)
|
|||||||
|
|
||||||
#endif /* __ASSEMBLY__ */
|
#endif /* __ASSEMBLY__ */
|
||||||
|
|
||||||
|
void arch_flush_icache_all(void);
|
||||||
|
|
||||||
#endif /* !__HEADER_ARM64_ARCH_CPU_H */
|
#endif /* !__HEADER_ARM64_ARCH_CPU_H */
|
||||||
|
|||||||
@@ -1,60 +0,0 @@
|
|||||||
#ifndef ARCH_RUSAGE_H_INCLUDED
|
|
||||||
#define ARCH_RUSAGE_H_INCLUDED
|
|
||||||
|
|
||||||
#define DEBUG_RUSAGE
|
|
||||||
|
|
||||||
#define IHK_OS_PGSIZE_4KB 0
|
|
||||||
#define IHK_OS_PGSIZE_2MB 1
|
|
||||||
#define IHK_OS_PGSIZE_1GB 2
|
|
||||||
|
|
||||||
extern struct ihk_os_monitor *monitor;
|
|
||||||
|
|
||||||
extern int sprintf(char * buf, const char *fmt, ...);
|
|
||||||
|
|
||||||
#define DEBUG_ARCH_RUSAGE
|
|
||||||
#ifdef DEBUG_ARCH_RUSAGE
|
|
||||||
#define dprintf(...) \
|
|
||||||
do { \
|
|
||||||
char msg[1024]; \
|
|
||||||
sprintf(msg, __VA_ARGS__); \
|
|
||||||
kprintf("%s,%s", __FUNCTION__, msg); \
|
|
||||||
} while (0);
|
|
||||||
#define eprintf(...) \
|
|
||||||
do { \
|
|
||||||
char msg[1024]; \
|
|
||||||
sprintf(msg, __VA_ARGS__); \
|
|
||||||
kprintf("%s,%s", __FUNCTION__, msg); \
|
|
||||||
} while (0);
|
|
||||||
#else
|
|
||||||
#define dprintf(...) do { } while (0)
|
|
||||||
#define eprintf(...) \
|
|
||||||
do { \
|
|
||||||
char msg[1024]; \
|
|
||||||
sprintf(msg, __VA_ARGS__); \
|
|
||||||
kprintf("%s,%s", __FUNCTION__, msg); \
|
|
||||||
} while (0);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static inline int rusage_pgsize_to_pgtype(size_t pgsize)
|
|
||||||
{
|
|
||||||
int ret = IHK_OS_PGSIZE_4KB;
|
|
||||||
#if 0 /* postk-TODO */
|
|
||||||
switch (pgsize) {
|
|
||||||
case PTL1_SIZE:
|
|
||||||
ret = IHK_OS_PGSIZE_4KB;
|
|
||||||
break;
|
|
||||||
case PTL2_SIZE:
|
|
||||||
ret = IHK_OS_PGSIZE_2MB;
|
|
||||||
break;
|
|
||||||
case PTL3_SIZE:
|
|
||||||
ret = IHK_OS_PGSIZE_1GB;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
eprintf("unknown pgsize=%ld\n", pgsize);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif /* !defined(ARCH_RUSAGE_H_INCLUDED) */
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
#ifndef ARCH_RUSAGE_H_INCLUDED
|
|
||||||
#define ARCH_RUSAGE_H_INCLUDED
|
|
||||||
|
|
||||||
#include <arch-memory.h>
|
|
||||||
|
|
||||||
#define DEBUG_RUSAGE
|
|
||||||
|
|
||||||
#define IHK_OS_PGSIZE_4KB 0
|
|
||||||
#define IHK_OS_PGSIZE_2MB 1
|
|
||||||
#define IHK_OS_PGSIZE_1GB 2
|
|
||||||
|
|
||||||
extern struct rusage_global rusage;
|
|
||||||
|
|
||||||
static inline int rusage_pgsize_to_pgtype(size_t pgsize)
|
|
||||||
{
|
|
||||||
int ret = IHK_OS_PGSIZE_4KB;
|
|
||||||
|
|
||||||
if (pgsize == PTL1_SIZE) {
|
|
||||||
ret = IHK_OS_PGSIZE_4KB;
|
|
||||||
}
|
|
||||||
else if (pgsize == PTL2_SIZE) {
|
|
||||||
ret = IHK_OS_PGSIZE_2MB;
|
|
||||||
}
|
|
||||||
else if (pgsize == PTL3_SIZE) {
|
|
||||||
ret = IHK_OS_PGSIZE_1GB;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
kprintf("%s: Error: Unknown pgsize=%ld\n", __FUNCTION__, pgsize);
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif /* !defined(ARCH_RUSAGE_H_INCLUDED) */
|
|
||||||
@@ -8,6 +8,7 @@
|
|||||||
#define SYSCALL_HANDLED(number, name) DECLARATOR(number, name)
|
#define SYSCALL_HANDLED(number, name) DECLARATOR(number, name)
|
||||||
#define SYSCALL_DELEGATED(number, name) DECLARATOR(number, name)
|
#define SYSCALL_DELEGATED(number, name) DECLARATOR(number, name)
|
||||||
|
|
||||||
|
#include <config.h>
|
||||||
#include <syscall_list.h>
|
#include <syscall_list.h>
|
||||||
|
|
||||||
#undef DECLARATOR
|
#undef DECLARATOR
|
||||||
|
|||||||
@@ -67,21 +67,12 @@ struct arm64_cpu_capabilities {
|
|||||||
int def_scope;/* default scope */
|
int def_scope;/* default scope */
|
||||||
int (*matches)(const struct arm64_cpu_capabilities *caps, int scope);
|
int (*matches)(const struct arm64_cpu_capabilities *caps, int scope);
|
||||||
int (*enable)(void *);/* Called on all active CPUs */
|
int (*enable)(void *);/* Called on all active CPUs */
|
||||||
union {
|
uint32_t sys_reg;
|
||||||
struct {/* To be used for erratum handling only */
|
uint8_t field_pos;
|
||||||
uint32_t midr_model;
|
uint8_t min_field_value;
|
||||||
uint32_t midr_range_min, midr_range_max;
|
uint8_t hwcap_type;
|
||||||
};
|
int sign;
|
||||||
|
unsigned long hwcap;
|
||||||
struct {/* Feature register checking */
|
|
||||||
uint32_t sys_reg;
|
|
||||||
uint8_t field_pos;
|
|
||||||
uint8_t min_field_value;
|
|
||||||
uint8_t hwcap_type;
|
|
||||||
int sign;
|
|
||||||
unsigned long hwcap;
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/* @ref.impl include/linux/bitops.h */
|
/* @ref.impl include/linux/bitops.h */
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
/* fpsimd.h COPYRIGHT FUJITSU LIMITED 2016-2017 */
|
/* fpsimd.h COPYRIGHT FUJITSU LIMITED 2016-2019 */
|
||||||
#ifndef __HEADER_ARM64_COMMON_FPSIMD_H
|
#ifndef __HEADER_ARM64_COMMON_FPSIMD_H
|
||||||
#define __HEADER_ARM64_COMMON_FPSIMD_H
|
#define __HEADER_ARM64_COMMON_FPSIMD_H
|
||||||
|
|
||||||
@@ -42,16 +42,19 @@ extern void thread_sve_to_fpsimd(struct thread *thread, fp_regs_struct *fp_regs)
|
|||||||
|
|
||||||
extern size_t sve_state_size(struct thread const *thread);
|
extern size_t sve_state_size(struct thread const *thread);
|
||||||
extern void sve_free(struct thread *thread);
|
extern void sve_free(struct thread *thread);
|
||||||
extern void sve_alloc(struct thread *thread);
|
extern int sve_alloc(struct thread *thread);
|
||||||
extern void sve_save_state(void *state, unsigned int *pfpsr);
|
extern void sve_save_state(void *state, unsigned int *pfpsr);
|
||||||
extern void sve_load_state(void const *state, unsigned int const *pfpsr, unsigned long vq_minus_1);
|
extern void sve_load_state(void const *state, unsigned int const *pfpsr, unsigned long vq_minus_1);
|
||||||
extern unsigned int sve_get_vl(void);
|
extern unsigned int sve_get_vl(void);
|
||||||
extern int sve_set_thread_vl(struct thread *thread, const unsigned long vector_length, const unsigned long flags);
|
extern int sve_set_thread_vl(unsigned long arg);
|
||||||
extern int sve_get_thread_vl(const struct thread *thread);
|
extern int sve_get_thread_vl(void);
|
||||||
extern int sve_set_vector_length(struct thread *thread, unsigned long vl, unsigned long flags);
|
extern int sve_set_vector_length(struct thread *thread, unsigned long vl, unsigned long flags);
|
||||||
|
|
||||||
#define SVE_SET_VL(thread, vector_length, flags) sve_set_thread_vl(thread, vector_length, flags)
|
#define SVE_SET_VL(arg) sve_set_thread_vl(arg)
|
||||||
#define SVE_GET_VL(thread) sve_get_thread_vl(thread)
|
#define SVE_GET_VL() sve_get_thread_vl()
|
||||||
|
|
||||||
|
/* Maximum VL that SVE VL-agnostic software can transparently support */
|
||||||
|
#define SVE_VL_ARCH_MAX 0x100
|
||||||
|
|
||||||
#else /* CONFIG_ARM64_SVE */
|
#else /* CONFIG_ARM64_SVE */
|
||||||
|
|
||||||
@@ -80,12 +83,12 @@ static int sve_set_vector_length(struct thread *thread, unsigned long vl, unsign
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* for prctl syscall */
|
/* for prctl syscall */
|
||||||
#define SVE_SET_VL(a,b,c) (-EINVAL)
|
#define SVE_SET_VL(a) (-EINVAL)
|
||||||
#define SVE_GET_VL(a) (-EINVAL)
|
#define SVE_GET_VL() (-EINVAL)
|
||||||
|
|
||||||
#endif /* CONFIG_ARM64_SVE */
|
#endif /* CONFIG_ARM64_SVE */
|
||||||
|
|
||||||
extern void init_sve_vl(void);
|
extern void sve_setup(void);
|
||||||
extern void fpsimd_save_state(struct fpsimd_state *state);
|
extern void fpsimd_save_state(struct fpsimd_state *state);
|
||||||
extern void fpsimd_load_state(struct fpsimd_state *state);
|
extern void fpsimd_load_state(struct fpsimd_state *state);
|
||||||
extern void thread_fpsimd_save(struct thread *thread);
|
extern void thread_fpsimd_save(struct thread *thread);
|
||||||
|
|||||||
@@ -124,7 +124,7 @@ static inline long ihk_atomic64_read(const ihk_atomic64_t *v)
|
|||||||
return *(volatile long *)&(v)->counter64;
|
return *(volatile long *)&(v)->counter64;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void ihk_atomic64_set(ihk_atomic64_t *v, int i)
|
static inline void ihk_atomic64_set(ihk_atomic64_t *v, long i)
|
||||||
{
|
{
|
||||||
v->counter64 = i;
|
v->counter64 = i;
|
||||||
}
|
}
|
||||||
@@ -147,6 +147,8 @@ static inline void ihk_atomic64_add(long i, ihk_atomic64_t *v)
|
|||||||
/* @ref.impl arch/arm64/include/asm/atomic.h::atomic64_inc */
|
/* @ref.impl arch/arm64/include/asm/atomic.h::atomic64_inc */
|
||||||
#define ihk_atomic64_inc(v) ihk_atomic64_add(1LL, (v))
|
#define ihk_atomic64_inc(v) ihk_atomic64_add(1LL, (v))
|
||||||
|
|
||||||
|
#define ihk_atomic64_cmpxchg(p, o, n) cmpxchg(&((p)->counter64), o, n)
|
||||||
|
|
||||||
/***********************************************************************
|
/***********************************************************************
|
||||||
* others
|
* others
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -29,6 +29,7 @@
|
|||||||
#define IMP_PF_INJECTION_DISTANCE5_EL0 sys_reg(3, 3, 11, 7, 5)
|
#define IMP_PF_INJECTION_DISTANCE5_EL0 sys_reg(3, 3, 11, 7, 5)
|
||||||
#define IMP_PF_INJECTION_DISTANCE6_EL0 sys_reg(3, 3, 11, 7, 6)
|
#define IMP_PF_INJECTION_DISTANCE6_EL0 sys_reg(3, 3, 11, 7, 6)
|
||||||
#define IMP_PF_INJECTION_DISTANCE7_EL0 sys_reg(3, 3, 11, 7, 7)
|
#define IMP_PF_INJECTION_DISTANCE7_EL0 sys_reg(3, 3, 11, 7, 7)
|
||||||
|
#define IMP_PF_PMUSERENR_EL0 sys_reg(3, 3, 9, 14, 0)
|
||||||
#define IMP_BARRIER_CTRL_EL1 sys_reg(3, 0, 11, 12, 0)
|
#define IMP_BARRIER_CTRL_EL1 sys_reg(3, 0, 11, 12, 0)
|
||||||
#define IMP_BARRIER_BST_BIT_EL1 sys_reg(3, 0, 11, 12, 4)
|
#define IMP_BARRIER_BST_BIT_EL1 sys_reg(3, 0, 11, 12, 4)
|
||||||
#define IMP_BARRIER_INIT_SYNC_BB0_EL1 sys_reg(3, 0, 15, 13, 0)
|
#define IMP_BARRIER_INIT_SYNC_BB0_EL1 sys_reg(3, 0, 15, 13, 0)
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
/* irq.h COPYRIGHT FUJITSU LIMITED 2015-2018 */
|
/* irq.h COPYRIGHT FUJITSU LIMITED 2015-2019 */
|
||||||
|
|
||||||
#ifndef __HEADER_ARM64_IRQ_H
|
#ifndef __HEADER_ARM64_IRQ_H
|
||||||
#define __HEADER_ARM64_IRQ_H
|
#define __HEADER_ARM64_IRQ_H
|
||||||
@@ -14,7 +14,8 @@
|
|||||||
#define INTRID_QUERY_FREE_MEM 2
|
#define INTRID_QUERY_FREE_MEM 2
|
||||||
#define INTRID_CPU_STOP 3
|
#define INTRID_CPU_STOP 3
|
||||||
#define INTRID_TLB_FLUSH 4
|
#define INTRID_TLB_FLUSH 4
|
||||||
#define INTRID_STACK_TRACE 6
|
#define INTRID_STACK_TRACE 5
|
||||||
|
#define INTRID_MULTI_INTR 6
|
||||||
#define INTRID_MULTI_NMI 7
|
#define INTRID_MULTI_NMI 7
|
||||||
|
|
||||||
/* use PPI interrupt number */
|
/* use PPI interrupt number */
|
||||||
@@ -29,6 +30,7 @@ extern void gic_dist_init_gicv2(unsigned long dist_base_pa, unsigned long size);
|
|||||||
extern void gic_cpu_init_gicv2(unsigned long cpu_base_pa, unsigned long size);
|
extern void gic_cpu_init_gicv2(unsigned long cpu_base_pa, unsigned long size);
|
||||||
extern void gic_enable_gicv2(void);
|
extern void gic_enable_gicv2(void);
|
||||||
extern void arm64_issue_ipi_gicv2(unsigned int cpuid, unsigned int vector);
|
extern void arm64_issue_ipi_gicv2(unsigned int cpuid, unsigned int vector);
|
||||||
|
extern void arm64_issue_host_ipi_gicv2(uint32_t cpuid, uint32_t vector);
|
||||||
extern void handle_interrupt_gicv2(struct pt_regs *regs);
|
extern void handle_interrupt_gicv2(struct pt_regs *regs);
|
||||||
|
|
||||||
/* Functions for GICv3 */
|
/* Functions for GICv3 */
|
||||||
@@ -36,6 +38,7 @@ extern void gic_dist_init_gicv3(unsigned long dist_base_pa, unsigned long size);
|
|||||||
extern void gic_cpu_init_gicv3(unsigned long cpu_base_pa, unsigned long size);
|
extern void gic_cpu_init_gicv3(unsigned long cpu_base_pa, unsigned long size);
|
||||||
extern void gic_enable_gicv3(void);
|
extern void gic_enable_gicv3(void);
|
||||||
extern void arm64_issue_ipi_gicv3(unsigned int cpuid, unsigned int vector);
|
extern void arm64_issue_ipi_gicv3(unsigned int cpuid, unsigned int vector);
|
||||||
|
extern void arm64_issue_host_ipi_gicv3(uint32_t cpuid, uint32_t vector);
|
||||||
extern void handle_interrupt_gicv3(struct pt_regs *regs);
|
extern void handle_interrupt_gicv3(struct pt_regs *regs);
|
||||||
|
|
||||||
void handle_IPI(unsigned int vector, struct pt_regs *regs);
|
void handle_IPI(unsigned int vector, struct pt_regs *regs);
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
/* prctl.h COPYRIGHT FUJITSU LIMITED 2017 */
|
/* prctl.h COPYRIGHT FUJITSU LIMITED 2017-2019 */
|
||||||
#ifndef __HEADER_ARM64_COMMON_PRCTL_H
|
#ifndef __HEADER_ARM64_COMMON_PRCTL_H
|
||||||
#define __HEADER_ARM64_COMMON_PRCTL_H
|
#define __HEADER_ARM64_COMMON_PRCTL_H
|
||||||
|
|
||||||
@@ -6,15 +6,12 @@
|
|||||||
#define PR_GET_THP_DISABLE 42
|
#define PR_GET_THP_DISABLE 42
|
||||||
|
|
||||||
/* arm64 Scalable Vector Extension controls */
|
/* arm64 Scalable Vector Extension controls */
|
||||||
#define PR_SVE_SET_VL 48 /* set task vector length */
|
/* Flag values must be kept in sync with ptrace NT_ARM_SVE interface */
|
||||||
#define PR_SVE_SET_VL_THREAD (1 << 1) /* set just this thread */
|
#define PR_SVE_SET_VL 50 /* set task vector length */
|
||||||
#define PR_SVE_SET_VL_INHERIT (1 << 2) /* inherit across exec */
|
# define PR_SVE_SET_VL_ONEXEC (1 << 18) /* defer effect until exec */
|
||||||
#define PR_SVE_SET_VL_ONEXEC (1 << 3) /* defer effect until exec */
|
#define PR_SVE_GET_VL 51 /* get task vector length */
|
||||||
|
/* Bits common to PR_SVE_SET_VL and PR_SVE_GET_VL */
|
||||||
#define PR_SVE_GET_VL 49 /* get task vector length */
|
# define PR_SVE_VL_LEN_MASK 0xffff
|
||||||
/* Decode helpers for the return value from PR_SVE_GET_VL: */
|
# define PR_SVE_VL_INHERIT (1 << 17) /* inherit across exec */
|
||||||
#define PR_SVE_GET_VL_LEN(ret) ((ret) & 0x3fff) /* vector length */
|
|
||||||
#define PR_SVE_GET_VL_INHERIT (PR_SVE_SET_VL_INHERIT << 16)
|
|
||||||
/* For conveinence, PR_SVE_SET_VL returns the result in the same encoding */
|
|
||||||
|
|
||||||
#endif /* !__HEADER_ARM64_COMMON_PRCTL_H */
|
#endif /* !__HEADER_ARM64_COMMON_PRCTL_H */
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
/* ptrace.h COPYRIGHT FUJITSU LIMITED 2015-2017 */
|
/* ptrace.h COPYRIGHT FUJITSU LIMITED 2015-2019 */
|
||||||
#ifndef __HEADER_ARM64_COMMON_PTRACE_H
|
#ifndef __HEADER_ARM64_COMMON_PTRACE_H
|
||||||
#define __HEADER_ARM64_COMMON_PTRACE_H
|
#define __HEADER_ARM64_COMMON_PTRACE_H
|
||||||
|
|
||||||
@@ -46,6 +46,7 @@
|
|||||||
|
|
||||||
#ifndef __ASSEMBLY__
|
#ifndef __ASSEMBLY__
|
||||||
|
|
||||||
|
#include <lwk/compiler.h>
|
||||||
#include <ihk/types.h>
|
#include <ihk/types.h>
|
||||||
|
|
||||||
struct user_hwdebug_state {
|
struct user_hwdebug_state {
|
||||||
@@ -78,6 +79,70 @@ struct user_sve_header {
|
|||||||
uint16_t __reserved;
|
uint16_t __reserved;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum aarch64_regset {
|
||||||
|
REGSET_GPR,
|
||||||
|
REGSET_FPR,
|
||||||
|
REGSET_TLS,
|
||||||
|
REGSET_HW_BREAK,
|
||||||
|
REGSET_HW_WATCH,
|
||||||
|
REGSET_SYSTEM_CALL,
|
||||||
|
#ifdef CONFIG_ARM64_SVE
|
||||||
|
REGSET_SVE,
|
||||||
|
#endif /* CONFIG_ARM64_SVE */
|
||||||
|
};
|
||||||
|
|
||||||
|
struct thread;
|
||||||
|
struct user_regset;
|
||||||
|
|
||||||
|
typedef int user_regset_active_fn(struct thread *target,
|
||||||
|
const struct user_regset *regset);
|
||||||
|
|
||||||
|
typedef long user_regset_get_fn(struct thread *target,
|
||||||
|
const struct user_regset *regset,
|
||||||
|
unsigned int pos, unsigned int count,
|
||||||
|
void *kbuf, void __user *ubuf);
|
||||||
|
|
||||||
|
typedef long user_regset_set_fn(struct thread *target,
|
||||||
|
const struct user_regset *regset,
|
||||||
|
unsigned int pos, unsigned int count,
|
||||||
|
const void *kbuf, const void __user *ubuf);
|
||||||
|
|
||||||
|
typedef int user_regset_writeback_fn(struct thread *target,
|
||||||
|
const struct user_regset *regset,
|
||||||
|
int immediate);
|
||||||
|
|
||||||
|
typedef unsigned int user_regset_get_size_fn(struct thread *target,
|
||||||
|
const struct user_regset *regset);
|
||||||
|
|
||||||
|
struct user_regset {
|
||||||
|
user_regset_get_fn *get;
|
||||||
|
user_regset_set_fn *set;
|
||||||
|
user_regset_active_fn *active;
|
||||||
|
user_regset_writeback_fn *writeback;
|
||||||
|
user_regset_get_size_fn *get_size;
|
||||||
|
unsigned int n;
|
||||||
|
unsigned int size;
|
||||||
|
unsigned int align;
|
||||||
|
unsigned int bias;
|
||||||
|
unsigned int core_note_type;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct user_regset_view {
|
||||||
|
const char *name;
|
||||||
|
const struct user_regset *regsets;
|
||||||
|
unsigned int n;
|
||||||
|
uint32_t e_flags;
|
||||||
|
uint16_t e_machine;
|
||||||
|
uint8_t ei_osabi;
|
||||||
|
};
|
||||||
|
|
||||||
|
extern const struct user_regset_view *current_user_regset_view(void);
|
||||||
|
extern const struct user_regset *find_regset(
|
||||||
|
const struct user_regset_view *view,
|
||||||
|
unsigned int type);
|
||||||
|
extern unsigned int regset_size(struct thread *target,
|
||||||
|
const struct user_regset *regset);
|
||||||
|
|
||||||
/* Definitions for user_sve_header.flags: */
|
/* Definitions for user_sve_header.flags: */
|
||||||
#define SVE_PT_REGS_MASK (1 << 0)
|
#define SVE_PT_REGS_MASK (1 << 0)
|
||||||
|
|
||||||
@@ -85,7 +150,7 @@ struct user_sve_header {
|
|||||||
#define SVE_PT_REGS_SVE SVE_PT_REGS_MASK
|
#define SVE_PT_REGS_SVE SVE_PT_REGS_MASK
|
||||||
|
|
||||||
#define SVE_PT_VL_THREAD PR_SVE_SET_VL_THREAD
|
#define SVE_PT_VL_THREAD PR_SVE_SET_VL_THREAD
|
||||||
#define SVE_PT_VL_INHERIT PR_SVE_SET_VL_INHERIT
|
#define SVE_PT_VL_INHERIT PR_SVE_VL_INHERIT
|
||||||
#define SVE_PT_VL_ONEXEC PR_SVE_SET_VL_ONEXEC
|
#define SVE_PT_VL_ONEXEC PR_SVE_SET_VL_ONEXEC
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -99,7 +164,9 @@ struct user_sve_header {
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
/* Offset from the start of struct user_sve_header to the register data */
|
/* Offset from the start of struct user_sve_header to the register data */
|
||||||
#define SVE_PT_REGS_OFFSET ((sizeof(struct sve_context) + 15) / 16 * 16)
|
#define SVE_PT_REGS_OFFSET \
|
||||||
|
((sizeof(struct sve_context) + (SVE_VQ_BYTES - 1)) \
|
||||||
|
/ SVE_VQ_BYTES * SVE_VQ_BYTES)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The register data content and layout depends on the value of the
|
* The register data content and layout depends on the value of the
|
||||||
@@ -174,8 +241,10 @@ struct user_sve_header {
|
|||||||
#define SVE_PT_SVE_FFR_OFFSET(vq) \
|
#define SVE_PT_SVE_FFR_OFFSET(vq) \
|
||||||
__SVE_SIG_TO_PT(SVE_SIG_FFR_OFFSET(vq))
|
__SVE_SIG_TO_PT(SVE_SIG_FFR_OFFSET(vq))
|
||||||
|
|
||||||
#define SVE_PT_SVE_FPSR_OFFSET(vq) \
|
#define SVE_PT_SVE_FPSR_OFFSET(vq) \
|
||||||
((SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq) + 15) / 16 * 16)
|
((SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq) + \
|
||||||
|
(SVE_VQ_BYTES - 1)) \
|
||||||
|
/ SVE_VQ_BYTES * SVE_VQ_BYTES)
|
||||||
#define SVE_PT_SVE_FPCR_OFFSET(vq) \
|
#define SVE_PT_SVE_FPCR_OFFSET(vq) \
|
||||||
(SVE_PT_SVE_FPSR_OFFSET(vq) + SVE_PT_SVE_FPSR_SIZE)
|
(SVE_PT_SVE_FPSR_OFFSET(vq) + SVE_PT_SVE_FPSR_SIZE)
|
||||||
|
|
||||||
@@ -184,9 +253,10 @@ struct user_sve_header {
|
|||||||
* 128-bit boundary.
|
* 128-bit boundary.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define SVE_PT_SVE_SIZE(vq, flags) \
|
#define SVE_PT_SVE_SIZE(vq, flags) \
|
||||||
((SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE - \
|
((SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE \
|
||||||
SVE_PT_SVE_OFFSET + 15) / 16 * 16)
|
- SVE_PT_SVE_OFFSET + (SVE_VQ_BYTES - 1)) \
|
||||||
|
/ SVE_VQ_BYTES * SVE_VQ_BYTES)
|
||||||
|
|
||||||
#define SVE_PT_SIZE(vq, flags) \
|
#define SVE_PT_SIZE(vq, flags) \
|
||||||
(((flags) & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE ? \
|
(((flags) & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE ? \
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
/* signal.h COPYRIGHT FUJITSU LIMITED 2015-2018 */
|
/* signal.h COPYRIGHT FUJITSU LIMITED 2015-2019 */
|
||||||
#ifndef __HEADER_ARM64_COMMON_SIGNAL_H
|
#ifndef __HEADER_ARM64_COMMON_SIGNAL_H
|
||||||
#define __HEADER_ARM64_COMMON_SIGNAL_H
|
#define __HEADER_ARM64_COMMON_SIGNAL_H
|
||||||
|
|
||||||
@@ -298,6 +298,7 @@ struct extra_context {
|
|||||||
struct _aarch64_ctx head;
|
struct _aarch64_ctx head;
|
||||||
void *data; /* 16-byte aligned pointer to the extra space */
|
void *data; /* 16-byte aligned pointer to the extra space */
|
||||||
uint32_t size; /* size in bytes of the extra space */
|
uint32_t size; /* size in bytes of the extra space */
|
||||||
|
uint32_t __reserved[3];
|
||||||
};
|
};
|
||||||
|
|
||||||
#define SVE_MAGIC 0x53564501
|
#define SVE_MAGIC 0x53564501
|
||||||
@@ -318,19 +319,25 @@ struct sve_context {
|
|||||||
* The SVE architecture leaves space for future expansion of the
|
* The SVE architecture leaves space for future expansion of the
|
||||||
* vector length beyond its initial architectural limit of 2048 bits
|
* vector length beyond its initial architectural limit of 2048 bits
|
||||||
* (16 quadwords).
|
* (16 quadwords).
|
||||||
|
*
|
||||||
|
* See linux/Documentation/arm64/sve.txt for a description of the VL/VQ
|
||||||
|
* terminology.
|
||||||
*/
|
*/
|
||||||
#define SVE_VQ_MIN 1
|
#define SVE_VQ_BYTES 16 /* number of bytes per quadword */
|
||||||
#define SVE_VQ_MAX 0x200
|
|
||||||
|
|
||||||
#define SVE_VL_MIN (SVE_VQ_MIN * 0x10)
|
#define SVE_VQ_MIN 1
|
||||||
#define SVE_VL_MAX (SVE_VQ_MAX * 0x10)
|
#define SVE_VQ_MAX 512
|
||||||
|
|
||||||
|
#define SVE_VL_MIN (SVE_VQ_MIN * SVE_VQ_BYTES)
|
||||||
|
#define SVE_VL_MAX (SVE_VQ_MAX * SVE_VQ_BYTES)
|
||||||
|
|
||||||
#define SVE_NUM_ZREGS 32
|
#define SVE_NUM_ZREGS 32
|
||||||
#define SVE_NUM_PREGS 16
|
#define SVE_NUM_PREGS 16
|
||||||
|
|
||||||
#define sve_vl_valid(vl) \
|
#define sve_vl_valid(vl) \
|
||||||
((vl) % 0x10 == 0 && (vl) >= SVE_VL_MIN && (vl) <= SVE_VL_MAX)
|
((vl) % SVE_VQ_BYTES == 0 && (vl) >= SVE_VL_MIN && (vl) <= SVE_VL_MAX)
|
||||||
#define sve_vq_from_vl(vl) ((vl) / 0x10)
|
#define sve_vq_from_vl(vl) ((vl) / SVE_VQ_BYTES)
|
||||||
|
#define sve_vl_from_vq(vq) ((vq) * SVE_VQ_BYTES)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The total size of meaningful data in the SVE context in bytes,
|
* The total size of meaningful data in the SVE context in bytes,
|
||||||
@@ -365,11 +372,13 @@ struct sve_context {
|
|||||||
* Additional data might be appended in the future.
|
* Additional data might be appended in the future.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define SVE_SIG_ZREG_SIZE(vq) ((uint32_t)(vq) * 16)
|
#define SVE_SIG_ZREG_SIZE(vq) ((uint32_t)(vq) * SVE_VQ_BYTES)
|
||||||
#define SVE_SIG_PREG_SIZE(vq) ((uint32_t)(vq) * 2)
|
#define SVE_SIG_PREG_SIZE(vq) ((uint32_t)(vq) * (SVE_VQ_BYTES / 8))
|
||||||
#define SVE_SIG_FFR_SIZE(vq) SVE_SIG_PREG_SIZE(vq)
|
#define SVE_SIG_FFR_SIZE(vq) SVE_SIG_PREG_SIZE(vq)
|
||||||
|
|
||||||
#define SVE_SIG_REGS_OFFSET ((sizeof(struct sve_context) + 15) / 16 * 16)
|
#define SVE_SIG_REGS_OFFSET \
|
||||||
|
((sizeof(struct sve_context) + (SVE_VQ_BYTES - 1)) \
|
||||||
|
/ SVE_VQ_BYTES * SVE_VQ_BYTES)
|
||||||
|
|
||||||
#define SVE_SIG_ZREGS_OFFSET SVE_SIG_REGS_OFFSET
|
#define SVE_SIG_ZREGS_OFFSET SVE_SIG_REGS_OFFSET
|
||||||
#define SVE_SIG_ZREG_OFFSET(vq, n) \
|
#define SVE_SIG_ZREG_OFFSET(vq, n) \
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
SYSCALL_DELEGATED(4, io_getevents)
|
SYSCALL_DELEGATED(4, io_getevents)
|
||||||
SYSCALL_DELEGATED(17, getcwd)
|
SYSCALL_DELEGATED(17, getcwd)
|
||||||
SYSCALL_DELEGATED(22, epoll_pwait)
|
SYSCALL_HANDLED(22, epoll_pwait)
|
||||||
SYSCALL_DELEGATED(25, fcntl)
|
SYSCALL_DELEGATED(25, fcntl)
|
||||||
SYSCALL_HANDLED(29, ioctl)
|
SYSCALL_HANDLED(29, ioctl)
|
||||||
SYSCALL_DELEGATED(35, unlinkat)
|
SYSCALL_DELEGATED(35, unlinkat)
|
||||||
@@ -17,8 +17,8 @@ SYSCALL_DELEGATED(64, write)
|
|||||||
SYSCALL_DELEGATED(66, writev)
|
SYSCALL_DELEGATED(66, writev)
|
||||||
SYSCALL_DELEGATED(67, pread64)
|
SYSCALL_DELEGATED(67, pread64)
|
||||||
SYSCALL_DELEGATED(68, pwrite64)
|
SYSCALL_DELEGATED(68, pwrite64)
|
||||||
SYSCALL_DELEGATED(72, pselect6)
|
SYSCALL_HANDLED(72, pselect6)
|
||||||
SYSCALL_DELEGATED(73, ppoll)
|
SYSCALL_HANDLED(73, ppoll)
|
||||||
SYSCALL_HANDLED(74, signalfd4)
|
SYSCALL_HANDLED(74, signalfd4)
|
||||||
SYSCALL_DELEGATED(78, readlinkat)
|
SYSCALL_DELEGATED(78, readlinkat)
|
||||||
SYSCALL_DELEGATED(80, fstat)
|
SYSCALL_DELEGATED(80, fstat)
|
||||||
@@ -83,6 +83,7 @@ SYSCALL_HANDLED(175, geteuid)
|
|||||||
SYSCALL_HANDLED(176, getgid)
|
SYSCALL_HANDLED(176, getgid)
|
||||||
SYSCALL_HANDLED(177, getegid)
|
SYSCALL_HANDLED(177, getegid)
|
||||||
SYSCALL_HANDLED(178, gettid)
|
SYSCALL_HANDLED(178, gettid)
|
||||||
|
SYSCALL_HANDLED(179, sysinfo)
|
||||||
SYSCALL_DELEGATED(188, msgrcv)
|
SYSCALL_DELEGATED(188, msgrcv)
|
||||||
SYSCALL_DELEGATED(189, msgsnd)
|
SYSCALL_DELEGATED(189, msgsnd)
|
||||||
SYSCALL_DELEGATED(192, semtimedop)
|
SYSCALL_DELEGATED(192, semtimedop)
|
||||||
@@ -111,7 +112,7 @@ SYSCALL_HANDLED(236, get_mempolicy)
|
|||||||
SYSCALL_HANDLED(237, set_mempolicy)
|
SYSCALL_HANDLED(237, set_mempolicy)
|
||||||
SYSCALL_HANDLED(238, migrate_pages)
|
SYSCALL_HANDLED(238, migrate_pages)
|
||||||
SYSCALL_HANDLED(239, move_pages)
|
SYSCALL_HANDLED(239, move_pages)
|
||||||
#ifdef PERF_ENABLE
|
#ifdef ENABLE_PERF
|
||||||
SYSCALL_HANDLED(241, perf_event_open)
|
SYSCALL_HANDLED(241, perf_event_open)
|
||||||
#else // PERF_ENABLE
|
#else // PERF_ENABLE
|
||||||
SYSCALL_DELEGATED(241, perf_event_open)
|
SYSCALL_DELEGATED(241, perf_event_open)
|
||||||
@@ -119,12 +120,7 @@ SYSCALL_DELEGATED(241, perf_event_open)
|
|||||||
SYSCALL_HANDLED(260, wait4)
|
SYSCALL_HANDLED(260, wait4)
|
||||||
SYSCALL_HANDLED(270, process_vm_readv)
|
SYSCALL_HANDLED(270, process_vm_readv)
|
||||||
SYSCALL_HANDLED(271, process_vm_writev)
|
SYSCALL_HANDLED(271, process_vm_writev)
|
||||||
#ifdef PERF_ENABLE
|
SYSCALL_HANDLED(281, execveat)
|
||||||
SYSCALL_HANDLED(601, pmc_init)
|
|
||||||
SYSCALL_HANDLED(602, pmc_start)
|
|
||||||
SYSCALL_HANDLED(603, pmc_stop)
|
|
||||||
SYSCALL_HANDLED(604, pmc_reset)
|
|
||||||
#endif // PERF_ENABLE
|
|
||||||
SYSCALL_HANDLED(700, get_cpu_id)
|
SYSCALL_HANDLED(700, get_cpu_id)
|
||||||
#ifdef PROFILE_ENABLE
|
#ifdef PROFILE_ENABLE
|
||||||
SYSCALL_HANDLED(__NR_profile, profile)
|
SYSCALL_HANDLED(__NR_profile, profile)
|
||||||
@@ -132,6 +128,7 @@ SYSCALL_HANDLED(__NR_profile, profile)
|
|||||||
SYSCALL_HANDLED(730, util_migrate_inter_kernel)
|
SYSCALL_HANDLED(730, util_migrate_inter_kernel)
|
||||||
SYSCALL_HANDLED(731, util_indicate_clone)
|
SYSCALL_HANDLED(731, util_indicate_clone)
|
||||||
SYSCALL_HANDLED(732, get_system)
|
SYSCALL_HANDLED(732, get_system)
|
||||||
|
SYSCALL_HANDLED(733, util_register_desc)
|
||||||
|
|
||||||
/* McKernel Specific */
|
/* McKernel Specific */
|
||||||
SYSCALL_HANDLED(801, swapout)
|
SYSCALL_HANDLED(801, swapout)
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
/* thread_info.h COPYRIGHT FUJITSU LIMITED 2015-2018 */
|
/* thread_info.h COPYRIGHT FUJITSU LIMITED 2015-2019 */
|
||||||
#ifndef __HEADER_ARM64_COMMON_THREAD_INFO_H
|
#ifndef __HEADER_ARM64_COMMON_THREAD_INFO_H
|
||||||
#define __HEADER_ARM64_COMMON_THREAD_INFO_H
|
#define __HEADER_ARM64_COMMON_THREAD_INFO_H
|
||||||
|
|
||||||
@@ -46,9 +46,9 @@ struct thread_info {
|
|||||||
int cpu; /* cpu */
|
int cpu; /* cpu */
|
||||||
struct cpu_context cpu_context; /* kernel_context */
|
struct cpu_context cpu_context; /* kernel_context */
|
||||||
void *sve_state; /* SVE registers, if any */
|
void *sve_state; /* SVE registers, if any */
|
||||||
uint16_t sve_vl; /* SVE vector length */
|
unsigned int sve_vl; /* SVE vector length */
|
||||||
uint16_t sve_vl_onexec; /* SVE vl after next exec */
|
unsigned int sve_vl_onexec; /* SVE vl after next exec */
|
||||||
uint16_t sve_flags; /* SVE related flags */
|
unsigned long sve_flags; /* SVE related flags */
|
||||||
unsigned long fault_address; /* fault info */
|
unsigned long fault_address; /* fault info */
|
||||||
unsigned long fault_code; /* ESR_EL1 value */
|
unsigned long fault_code; /* ESR_EL1 value */
|
||||||
};
|
};
|
||||||
@@ -56,7 +56,7 @@ struct thread_info {
|
|||||||
/* Flags for sve_flags (intentionally defined to match the prctl flags) */
|
/* Flags for sve_flags (intentionally defined to match the prctl flags) */
|
||||||
|
|
||||||
/* Inherit sve_vl and sve_flags across execve(): */
|
/* Inherit sve_vl and sve_flags across execve(): */
|
||||||
#define THREAD_VL_INHERIT PR_SVE_SET_VL_INHERIT
|
#define THREAD_VL_INHERIT PR_SVE_VL_INHERIT
|
||||||
|
|
||||||
struct arm64_cpu_local_thread {
|
struct arm64_cpu_local_thread {
|
||||||
struct thread_info thread_info;
|
struct thread_info thread_info;
|
||||||
|
|||||||
@@ -4,6 +4,7 @@
|
|||||||
#define __ASM_TRAP_H
|
#define __ASM_TRAP_H
|
||||||
|
|
||||||
#include <types.h>
|
#include <types.h>
|
||||||
|
#include <arch-lock.h>
|
||||||
|
|
||||||
struct pt_regs;
|
struct pt_regs;
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
#include <memory.h>
|
#include <memory.h>
|
||||||
#include <affinity.h>
|
#include <affinity.h>
|
||||||
#include <syscall.h>
|
#include <syscall.h>
|
||||||
#include <debug.h>
|
#include <ihk/debug.h>
|
||||||
#include <arch-timer.h>
|
#include <arch-timer.h>
|
||||||
#include <cls.h>
|
#include <cls.h>
|
||||||
|
|
||||||
@@ -31,10 +31,9 @@ void *cpu_base;
|
|||||||
* function, it is not necessary to perform the disable/enable
|
* function, it is not necessary to perform the disable/enable
|
||||||
* interrupts in this function as gic_raise_softirq() .
|
* interrupts in this function as gic_raise_softirq() .
|
||||||
*/
|
*/
|
||||||
static void arm64_raise_sgi_gicv2(unsigned int cpuid, unsigned int vector)
|
static void __arm64_raise_sgi_gicv2(unsigned int hw_cpuid, unsigned int vector)
|
||||||
{
|
{
|
||||||
/* Build interrupt destination of the target cpu */
|
/* Build interrupt destination of the target cpu */
|
||||||
unsigned int hw_cpuid = ihk_mc_get_cpu_info()->hw_ids[cpuid];
|
|
||||||
uint8_t cpu_target_list = gic_hwid_to_affinity(hw_cpuid);
|
uint8_t cpu_target_list = gic_hwid_to_affinity(hw_cpuid);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -50,6 +49,23 @@ static void arm64_raise_sgi_gicv2(unsigned int cpuid, unsigned int vector)
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void arm64_raise_sgi_gicv2(uint32_t cpuid, uint32_t vector)
|
||||||
|
{
|
||||||
|
/* Build interrupt destination of the target CPU */
|
||||||
|
uint32_t hw_cpuid = ihk_mc_get_cpu_info()->hw_ids[cpuid];
|
||||||
|
|
||||||
|
__arm64_raise_sgi_gicv2(hw_cpuid, vector);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void arm64_raise_sgi_to_host_gicv2(uint32_t cpuid, uint32_t vector)
|
||||||
|
{
|
||||||
|
/* Build interrupt destination of the target Linux/host CPU */
|
||||||
|
uint32_t hw_cpuid = ihk_mc_get_apicid(cpuid);
|
||||||
|
|
||||||
|
__arm64_raise_sgi_gicv2(hw_cpuid, vector);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* arm64_raise_spi_gicv2
|
* arm64_raise_spi_gicv2
|
||||||
* @ref.impl nothing.
|
* @ref.impl nothing.
|
||||||
@@ -77,6 +93,11 @@ static void arm64_raise_spi_gicv2(unsigned int cpuid, unsigned int vector)
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void arm64_issue_host_ipi_gicv2(uint32_t cpuid, uint32_t vector)
|
||||||
|
{
|
||||||
|
arm64_raise_sgi_to_host_gicv2(cpuid, vector);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* arm64_issue_ipi_gicv2
|
* arm64_issue_ipi_gicv2
|
||||||
* @param cpuid : hardware cpu id
|
* @param cpuid : hardware cpu id
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
#include <cputype.h>
|
#include <cputype.h>
|
||||||
#include <process.h>
|
#include <process.h>
|
||||||
#include <syscall.h>
|
#include <syscall.h>
|
||||||
#include <debug.h>
|
#include <ihk/debug.h>
|
||||||
#include <arch-timer.h>
|
#include <arch-timer.h>
|
||||||
#include <cls.h>
|
#include <cls.h>
|
||||||
|
|
||||||
@@ -195,15 +195,12 @@ static inline void gic_write_bpr1(uint32_t val)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void arm64_raise_sgi_gicv3(uint32_t cpuid, uint32_t vector)
|
static void __arm64_raise_sgi_gicv3(uint32_t hw_cpuid, uint32_t vector)
|
||||||
{
|
{
|
||||||
uint64_t mpidr, cluster_id;
|
uint64_t mpidr, cluster_id;
|
||||||
uint16_t tlist;
|
uint16_t tlist;
|
||||||
uint64_t val;
|
uint64_t val;
|
||||||
|
|
||||||
/* Build interrupt destination of the target cpu */
|
|
||||||
uint32_t hw_cpuid = ihk_mc_get_cpu_info()->hw_ids[cpuid];
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Ensure that stores to Normal memory are visible to the
|
* Ensure that stores to Normal memory are visible to the
|
||||||
* other CPUs before issuing the IPI.
|
* other CPUs before issuing the IPI.
|
||||||
@@ -239,6 +236,22 @@ static void arm64_raise_sgi_gicv3(uint32_t cpuid, uint32_t vector)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void arm64_raise_sgi_gicv3(uint32_t cpuid, uint32_t vector)
|
||||||
|
{
|
||||||
|
/* Build interrupt destination of the target CPU */
|
||||||
|
uint32_t hw_cpuid = ihk_mc_get_cpu_info()->hw_ids[cpuid];
|
||||||
|
|
||||||
|
__arm64_raise_sgi_gicv3(hw_cpuid, vector);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void arm64_raise_sgi_to_host_gicv3(uint32_t cpuid, uint32_t vector)
|
||||||
|
{
|
||||||
|
/* Build interrupt destination of the target Linux/host CPU */
|
||||||
|
uint32_t hw_cpuid = ihk_mc_get_apicid(cpuid);
|
||||||
|
|
||||||
|
__arm64_raise_sgi_gicv3(hw_cpuid, vector);
|
||||||
|
}
|
||||||
|
|
||||||
static void arm64_raise_spi_gicv3(uint32_t cpuid, uint32_t vector)
|
static void arm64_raise_spi_gicv3(uint32_t cpuid, uint32_t vector)
|
||||||
{
|
{
|
||||||
uint64_t spi_reg_offset;
|
uint64_t spi_reg_offset;
|
||||||
@@ -268,6 +281,11 @@ static void arm64_raise_lpi_gicv3(uint32_t cpuid, uint32_t vector)
|
|||||||
ekprintf("%s called.\n", __func__);
|
ekprintf("%s called.\n", __func__);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void arm64_issue_host_ipi_gicv3(uint32_t cpuid, uint32_t vector)
|
||||||
|
{
|
||||||
|
arm64_raise_sgi_to_host_gicv3(cpuid, vector);
|
||||||
|
}
|
||||||
|
|
||||||
void arm64_issue_ipi_gicv3(uint32_t cpuid, uint32_t vector)
|
void arm64_issue_ipi_gicv3(uint32_t cpuid, uint32_t vector)
|
||||||
{
|
{
|
||||||
dkprintf("Send irq#%d to cpuid=%d\n", vector, cpuid);
|
dkprintf("Send irq#%d to cpuid=%d\n", vector, cpuid);
|
||||||
@@ -344,9 +362,11 @@ static void init_spi_routing(uint32_t irq, uint32_t linux_cpu)
|
|||||||
|
|
||||||
void gic_dist_init_gicv3(unsigned long dist_base_pa, unsigned long size)
|
void gic_dist_init_gicv3(unsigned long dist_base_pa, unsigned long size)
|
||||||
{
|
{
|
||||||
|
#ifndef IHK_IKC_USE_LINUX_WORK_IRQ
|
||||||
extern int spi_table[];
|
extern int spi_table[];
|
||||||
extern int nr_spi_table;
|
extern int nr_spi_table;
|
||||||
int i;
|
int i;
|
||||||
|
#endif // !IHK_IKC_USE_LINUX_WORK_IRQ
|
||||||
|
|
||||||
dist_base = map_fixed_area(dist_base_pa, size, 1 /*non chachable*/);
|
dist_base = map_fixed_area(dist_base_pa, size, 1 /*non chachable*/);
|
||||||
|
|
||||||
@@ -357,6 +377,7 @@ void gic_dist_init_gicv3(unsigned long dist_base_pa, unsigned long size)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef IHK_IKC_USE_LINUX_WORK_IRQ
|
||||||
/* initialize spi routing */
|
/* initialize spi routing */
|
||||||
for (i = 0; i < nr_spi_table; i++) {
|
for (i = 0; i < nr_spi_table; i++) {
|
||||||
if (spi_table[i] == -1) {
|
if (spi_table[i] == -1) {
|
||||||
@@ -364,6 +385,7 @@ void gic_dist_init_gicv3(unsigned long dist_base_pa, unsigned long size)
|
|||||||
}
|
}
|
||||||
init_spi_routing(spi_table[i], i);
|
init_spi_routing(spi_table[i], i);
|
||||||
}
|
}
|
||||||
|
#endif // !IHK_IKC_USE_LINUX_WORK_IRQ
|
||||||
}
|
}
|
||||||
|
|
||||||
void gic_cpu_init_gicv3(unsigned long cpu_base_pa, unsigned long size)
|
void gic_cpu_init_gicv3(unsigned long cpu_base_pa, unsigned long size)
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
/* memory.c COPYRIGHT FUJITSU LIMITED 2015-2018 */
|
/* memory.c COPYRIGHT FUJITSU LIMITED 2015-2018 */
|
||||||
#include <ihk/cpu.h>
|
#include <ihk/cpu.h>
|
||||||
#include <ihk/debug.h>
|
|
||||||
#include <ihk/mm.h>
|
#include <ihk/mm.h>
|
||||||
#include <types.h>
|
#include <types.h>
|
||||||
#include <memory.h>
|
#include <memory.h>
|
||||||
@@ -14,7 +13,7 @@
|
|||||||
#include <context.h>
|
#include <context.h>
|
||||||
#include <kmalloc.h>
|
#include <kmalloc.h>
|
||||||
#include <vdso.h>
|
#include <vdso.h>
|
||||||
#include <debug.h>
|
#include <ihk/debug.h>
|
||||||
#include <rusage_private.h>
|
#include <rusage_private.h>
|
||||||
#include <cputype.h>
|
#include <cputype.h>
|
||||||
|
|
||||||
@@ -2672,17 +2671,28 @@ int set_range_l1(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,
|
|||||||
}
|
}
|
||||||
|
|
||||||
phys = args->phys + (base - start);
|
phys = args->phys + (base - start);
|
||||||
if (__page_offset(base, PTL1_CONT_SIZE) == 0) { //check head pte
|
|
||||||
|
/* Check if we can begin / end a series of contiguous PTEs */
|
||||||
|
if (__page_offset(base, PTL1_CONT_SIZE) == 0) {
|
||||||
uintptr_t next_addr = base + PTL1_CONT_SIZE;
|
uintptr_t next_addr = base + PTL1_CONT_SIZE;
|
||||||
|
|
||||||
if (end < next_addr) {
|
if (end < next_addr) {
|
||||||
next_addr = end;
|
next_addr = end;
|
||||||
}
|
}
|
||||||
|
|
||||||
// set contiguous bit until the next head pte
|
/* Begin the series if physical address is also aligned and
|
||||||
// if phys is aligned and range does not end early.
|
* the range covers the series. Don't start or end it if
|
||||||
|
* physical address is not aligned or the range ends early.
|
||||||
|
*/
|
||||||
if (__page_offset(phys | next_addr, PTL1_CONT_SIZE) == 0) {
|
if (__page_offset(phys | next_addr, PTL1_CONT_SIZE) == 0) {
|
||||||
args->attr[0] |= PTE_CONT;
|
args->attr[0] |= PTE_CONT;
|
||||||
|
if (rusage_memory_stat_add(args->range, phys,
|
||||||
|
PTL1_CONT_SIZE,
|
||||||
|
PTL1_CONT_SIZE)) {
|
||||||
|
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
|
||||||
|
phys, __func__, base, phys,
|
||||||
|
PTL1_CONT_SIZE, PTL1_CONT_SIZE);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
args->attr[0] &= ~PTE_CONT;
|
args->attr[0] &= ~PTE_CONT;
|
||||||
}
|
}
|
||||||
@@ -2692,12 +2702,13 @@ int set_range_l1(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,
|
|||||||
|
|
||||||
error = 0;
|
error = 0;
|
||||||
// call memory_stat_rss_add() here because pgshift is resolved here
|
// call memory_stat_rss_add() here because pgshift is resolved here
|
||||||
if (rusage_memory_stat_add(args->range, phys, PTL1_SIZE, PTL1_SIZE)) {
|
if (!(args->attr[0] & PTE_CONT)) {
|
||||||
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
|
if (rusage_memory_stat_add(args->range, phys,
|
||||||
phys, __func__, base, phys, PTL1_SIZE, PTL1_SIZE);
|
PTL1_SIZE, PTL1_SIZE)) {
|
||||||
} else {
|
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
|
||||||
dkprintf("%s: !calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
|
phys, __func__, base, phys,
|
||||||
__func__, base, phys, PTL1_SIZE, PTL1_SIZE);
|
PTL1_SIZE, PTL1_SIZE);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
out:
|
out:
|
||||||
@@ -2761,7 +2772,9 @@ retry:
|
|||||||
|
|
||||||
phys = args->phys + (base - start);
|
phys = args->phys + (base - start);
|
||||||
|
|
||||||
//check head pte
|
/* Check if we can begin / end a series of
|
||||||
|
* contiguous PTEs
|
||||||
|
*/
|
||||||
if (__page_offset(base, tbl.cont_pgsize) == 0) {
|
if (__page_offset(base, tbl.cont_pgsize) == 0) {
|
||||||
uintptr_t next_addr = base +
|
uintptr_t next_addr = base +
|
||||||
tbl.cont_pgsize;
|
tbl.cont_pgsize;
|
||||||
@@ -2770,11 +2783,24 @@ retry:
|
|||||||
next_addr = end;
|
next_addr = end;
|
||||||
}
|
}
|
||||||
|
|
||||||
// set contiguous bit until the
|
/* Begin the series if physical address
|
||||||
// next head pte if phys is aligned
|
* is also aligned and the range covers
|
||||||
// and range does not end early.
|
* the series. Don't start or end it if
|
||||||
|
* physical address is not aligned or
|
||||||
|
* the range ends early.
|
||||||
|
*/
|
||||||
if (__page_offset(phys | next_addr, tbl.cont_pgsize) == 0) {
|
if (__page_offset(phys | next_addr, tbl.cont_pgsize) == 0) {
|
||||||
args->attr[level-1] |= PTE_CONT;
|
args->attr[level-1] |= PTE_CONT;
|
||||||
|
if (rusage_memory_stat_add(args->range,
|
||||||
|
phys,
|
||||||
|
tbl.cont_pgsize,
|
||||||
|
tbl.cont_pgsize)) {
|
||||||
|
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
|
||||||
|
phys, __func__,
|
||||||
|
base, phys,
|
||||||
|
tbl.cont_pgsize,
|
||||||
|
tbl.cont_pgsize);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
args->attr[level-1] &= ~PTE_CONT;
|
args->attr[level-1] &= ~PTE_CONT;
|
||||||
}
|
}
|
||||||
@@ -2782,21 +2808,23 @@ retry:
|
|||||||
|
|
||||||
ptl_set(ptep, phys | args->attr[level-1],
|
ptl_set(ptep, phys | args->attr[level-1],
|
||||||
level);
|
level);
|
||||||
|
|
||||||
error = 0;
|
error = 0;
|
||||||
dkprintf("set_range_middle(%lx,%lx,%lx,%d):"
|
dkprintf("set_range_middle(%lx,%lx,%lx,%d):"
|
||||||
"large page. %d %lx\n",
|
"large page. %d %lx\n",
|
||||||
base, start, end, level, error, *ptep);
|
base, start, end, level, error, *ptep);
|
||||||
// Call memory_stat_rss_add() here because pgshift is resolved here
|
// Call memory_stat_rss_add() here because pgshift is resolved here
|
||||||
if (rusage_memory_stat_add(args->range, phys,
|
if (!(args->attr[level-1] & PTE_CONT)) {
|
||||||
tbl.pgsize,
|
if (rusage_memory_stat_add(args->range,
|
||||||
tbl.pgsize)) {
|
phys,
|
||||||
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
|
tbl.pgsize,
|
||||||
phys, __func__, base, phys,
|
tbl.pgsize)) {
|
||||||
tbl.pgsize, tbl.pgsize);
|
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
|
||||||
} else {
|
phys, __func__, base,
|
||||||
dkprintf("%s: !calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
|
phys,
|
||||||
__func__, base, phys,
|
tbl.pgsize,
|
||||||
tbl.pgsize, tbl.pgsize);
|
tbl.pgsize);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
@@ -2848,7 +2876,7 @@ retry:
|
|||||||
error = 0;
|
error = 0;
|
||||||
out:
|
out:
|
||||||
if (tt_pa) {
|
if (tt_pa) {
|
||||||
ihk_mc_free_pages(tt_pa, 1);
|
ihk_mc_free_pages(phys_to_virt((unsigned long)tt_pa), 1);
|
||||||
}
|
}
|
||||||
dkprintf("set_range_middle(%lx,%lx,%lx,%d): %d %lx\n",
|
dkprintf("set_range_middle(%lx,%lx,%lx,%d): %d %lx\n",
|
||||||
base, start, end, level, error, *ptep);
|
base, start, end, level, error, *ptep);
|
||||||
@@ -3200,6 +3228,7 @@ void load_page_table(struct page_table *pt)
|
|||||||
{
|
{
|
||||||
if (pt == NULL) {
|
if (pt == NULL) {
|
||||||
// load page table for idle(EL1) process.
|
// load page table for idle(EL1) process.
|
||||||
|
switch_mm(init_pt);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// load page table for user(EL0) thread.
|
// load page table for user(EL0) thread.
|
||||||
@@ -3259,7 +3288,7 @@ void *map_fixed_area(unsigned long phys, unsigned long size, int uncachable)
|
|||||||
attr |= PTATTR_UNCACHABLE;
|
attr |= PTATTR_UNCACHABLE;
|
||||||
}
|
}
|
||||||
|
|
||||||
kprintf("map_fixed: phys: 0x%lx => 0x%lx (%d pages)\n",
|
dkprintf("map_fixed: phys: 0x%lx => 0x%lx (%d pages)\n",
|
||||||
paligned, v, npages);
|
paligned, v, npages);
|
||||||
|
|
||||||
pt = get_init_page_table();
|
pt = get_init_page_table();
|
||||||
@@ -3335,15 +3364,15 @@ unsigned long virt_to_phys(void *v)
|
|||||||
{
|
{
|
||||||
unsigned long va = (unsigned long)v;
|
unsigned long va = (unsigned long)v;
|
||||||
|
|
||||||
if (MAP_KERNEL_START <= va) {
|
if (va >= MAP_ST_START) {
|
||||||
return va - MAP_KERNEL_START + arm64_kernel_phys_base;
|
return va - MAP_ST_START + arm64_st_phys_base;
|
||||||
}
|
}
|
||||||
return va - MAP_ST_START;
|
return va - MAP_KERNEL_START + arm64_kernel_phys_base;
|
||||||
}
|
}
|
||||||
|
|
||||||
void *phys_to_virt(unsigned long p)
|
void *phys_to_virt(unsigned long p)
|
||||||
{
|
{
|
||||||
return (void *)(p | MAP_ST_START);
|
return (void *)((p - arm64_st_phys_base) | MAP_ST_START);
|
||||||
}
|
}
|
||||||
|
|
||||||
int copy_from_user(void *dst, const void *src, size_t siz)
|
int copy_from_user(void *dst, const void *src, size_t siz)
|
||||||
@@ -3716,44 +3745,6 @@ translation_table_t* get_translation_table_as_paddr(const struct page_table *pt)
|
|||||||
return pt->tt_pa;
|
return pt->tt_pa;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_8
|
|
||||||
void remote_flush_tlb_cpumask(struct process_vm *vm,
|
|
||||||
unsigned long addr, int cpu_id)
|
|
||||||
{
|
|
||||||
unsigned long cpu;
|
|
||||||
cpu_set_t _cpu_set;
|
|
||||||
int flush_ind;
|
|
||||||
|
|
||||||
if (addr) {
|
|
||||||
flush_ind = (addr >> PAGE_SHIFT) % IHK_TLB_FLUSH_IRQ_VECTOR_SIZE;
|
|
||||||
}
|
|
||||||
/* Zero address denotes full TLB flush */
|
|
||||||
else {
|
|
||||||
/* Random.. */
|
|
||||||
flush_ind = (rdtsc()) % IHK_TLB_FLUSH_IRQ_VECTOR_SIZE;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Take a copy of the cpu set so that we don't hold the lock
|
|
||||||
* all the way while interrupting other cores */
|
|
||||||
ihk_mc_spinlock_lock_noirq(&vm->address_space->cpu_set_lock);
|
|
||||||
memcpy(&_cpu_set, &vm->address_space->cpu_set, sizeof(cpu_set_t));
|
|
||||||
ihk_mc_spinlock_unlock_noirq(&vm->address_space->cpu_set_lock);
|
|
||||||
|
|
||||||
/* Loop through CPUs in this address space and interrupt them for
|
|
||||||
* TLB flush on the specified address */
|
|
||||||
for_each_set_bit(cpu, (const unsigned long*)&_cpu_set.__bits, CPU_SETSIZE) {
|
|
||||||
if (ihk_mc_get_processor_id() == cpu)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
dkprintf("remote_flush_tlb_cpumask: flush_ind: %d, addr: 0x%lX, interrupting cpu: %d\n",
|
|
||||||
flush_ind, addr, cpu);
|
|
||||||
|
|
||||||
ihk_mc_interrupt_cpu(cpu,
|
|
||||||
ihk_mc_get_vector(flush_ind + IHK_TLB_FLUSH_IRQ_VECTOR_START));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_8 */
|
|
||||||
|
|
||||||
void arch_adjust_allocate_page_size(struct page_table *pt,
|
void arch_adjust_allocate_page_size(struct page_table *pt,
|
||||||
uintptr_t fault_addr,
|
uintptr_t fault_addr,
|
||||||
pte_t *ptep,
|
pte_t *ptep,
|
||||||
|
|||||||
@@ -8,6 +8,7 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <ihk/mm.h>
|
#include <ihk/mm.h>
|
||||||
#include <irq.h>
|
#include <irq.h>
|
||||||
|
#include <process.h>
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* @ref.impl arch/arm64/kernel/perf_event.c
|
* @ref.impl arch/arm64/kernel/perf_event.c
|
||||||
@@ -85,25 +86,17 @@ void arm64_disable_user_access_pmu_regs(void)
|
|||||||
cpu_pmu.disable_user_access_pmu_regs();
|
cpu_pmu.disable_user_access_pmu_regs();
|
||||||
}
|
}
|
||||||
|
|
||||||
extern unsigned int *arm64_march_perfmap;
|
|
||||||
|
|
||||||
static int __ihk_mc_perfctr_init(int counter, uint32_t type, uint64_t config, int mode)
|
static int __ihk_mc_perfctr_init(int counter, uint32_t type, uint64_t config, int mode)
|
||||||
{
|
{
|
||||||
int ret = -1;
|
int ret = -1;
|
||||||
unsigned long config_base = 0;
|
unsigned long config_base = 0;
|
||||||
int mapping;
|
|
||||||
|
|
||||||
mapping = cpu_pmu.map_event(type, config);
|
ret = cpu_pmu.disable_counter(1UL << counter);
|
||||||
if (mapping < 0) {
|
|
||||||
return mapping;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = cpu_pmu.disable_counter(counter);
|
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = cpu_pmu.enable_intens(counter);
|
ret = cpu_pmu.enable_intens(1UL << counter);
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@@ -112,7 +105,7 @@ static int __ihk_mc_perfctr_init(int counter, uint32_t type, uint64_t config, in
|
|||||||
if (ret) {
|
if (ret) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
config_base |= (unsigned long)mapping;
|
config_base |= config;
|
||||||
cpu_pmu.write_evtype(counter, config_base);
|
cpu_pmu.write_evtype(counter, config_base);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@@ -124,68 +117,24 @@ int ihk_mc_perfctr_init_raw(int counter, uint64_t config, int mode)
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ihk_mc_perfctr_init(int counter, uint64_t config, int mode)
|
|
||||||
{
|
|
||||||
int ret;
|
|
||||||
ret = __ihk_mc_perfctr_init(counter, PERF_TYPE_RAW, config, mode);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
int ihk_mc_perfctr_start(unsigned long counter_mask)
|
int ihk_mc_perfctr_start(unsigned long counter_mask)
|
||||||
{
|
{
|
||||||
int ret = 0, i;
|
return cpu_pmu.enable_counter(counter_mask);
|
||||||
|
|
||||||
for (i = 0; i < sizeof(counter_mask) * BITS_PER_BYTE; i++) {
|
|
||||||
if (counter_mask & (1UL << i)) {
|
|
||||||
ret = cpu_pmu.enable_counter(i);
|
|
||||||
if (ret < 0) {
|
|
||||||
kprintf("%s: enable failed(idx=%d)\n",
|
|
||||||
__func__, i);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int ihk_mc_perfctr_stop(unsigned long counter_mask, int flags)
|
int ihk_mc_perfctr_stop(unsigned long counter_mask, int flags)
|
||||||
{
|
{
|
||||||
int i = 0;
|
return cpu_pmu.disable_counter(counter_mask);
|
||||||
|
|
||||||
for (i = 0; i < sizeof(counter_mask) * BITS_PER_BYTE; i++) {
|
|
||||||
if (!(counter_mask & (1UL << i)))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
int ret = 0;
|
|
||||||
|
|
||||||
ret = cpu_pmu.disable_counter(i);
|
|
||||||
if (ret < 0) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (flags & IHK_MC_PERFCTR_DISABLE_INTERRUPT) {
|
|
||||||
// when ihk_mc_perfctr_start is called,
|
|
||||||
// ihk_mc_perfctr_init is also called so disable
|
|
||||||
// interrupt
|
|
||||||
ret = cpu_pmu.disable_intens(i);
|
|
||||||
if (ret < 0) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int ihk_mc_perfctr_reset(int counter)
|
int ihk_mc_perfctr_reset(int counter)
|
||||||
{
|
{
|
||||||
// TODO[PMU]: ihk_mc_perfctr_setと同様にサンプリングレートの共通部実装の扱いを見てから本実装。
|
|
||||||
cpu_pmu.write_counter(counter, 0);
|
cpu_pmu.write_counter(counter, 0);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ihk_mc_perfctr_set(int counter, long val)
|
int ihk_mc_perfctr_set(int counter, long val)
|
||||||
{
|
{
|
||||||
// TODO[PMU]: 共通部でサンプリングレートの計算をして、設定するカウンタ値をvalに渡してくるようになると想定。サンプリングレートの扱いを見てから本実装。
|
|
||||||
uint32_t v = val;
|
uint32_t v = val;
|
||||||
cpu_pmu.write_counter(counter, v);
|
cpu_pmu.write_counter(counter, v);
|
||||||
return 0;
|
return 0;
|
||||||
@@ -198,6 +147,15 @@ int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ihk_mc_perfctr_alloc(struct thread *thread, struct mc_perf_event *event)
|
||||||
|
{
|
||||||
|
const int counters = ihk_mc_perf_get_num_counters();
|
||||||
|
|
||||||
|
return cpu_pmu.get_event_idx(counters,
|
||||||
|
thread->pmc_alloc_map,
|
||||||
|
event->hw_config);
|
||||||
|
}
|
||||||
|
|
||||||
unsigned long ihk_mc_perfctr_read(int counter)
|
unsigned long ihk_mc_perfctr_read(int counter)
|
||||||
{
|
{
|
||||||
unsigned long count;
|
unsigned long count;
|
||||||
@@ -205,6 +163,14 @@ unsigned long ihk_mc_perfctr_read(int counter)
|
|||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsigned long ihk_mc_perfctr_value(int counter, unsigned long correction)
|
||||||
|
{
|
||||||
|
unsigned long count = ihk_mc_perfctr_read(counter) + correction;
|
||||||
|
|
||||||
|
count &= ((1UL << 32) - 1);
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
int ihk_mc_perfctr_alloc_counter(unsigned int *type, unsigned long *config,
|
int ihk_mc_perfctr_alloc_counter(unsigned int *type, unsigned long *config,
|
||||||
unsigned long pmc_status)
|
unsigned long pmc_status)
|
||||||
{
|
{
|
||||||
@@ -234,12 +200,14 @@ int ihk_mc_perfctr_alloc_counter(unsigned int *type, unsigned long *config,
|
|||||||
|
|
||||||
int ihk_mc_perf_counter_mask_check(unsigned long counter_mask)
|
int ihk_mc_perf_counter_mask_check(unsigned long counter_mask)
|
||||||
{
|
{
|
||||||
return 1;
|
return cpu_pmu.counter_mask_valid(counter_mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
int ihk_mc_perf_get_num_counters(void)
|
int ihk_mc_perf_get_num_counters(void)
|
||||||
{
|
{
|
||||||
return cpu_pmu.per_cpu[ihk_mc_get_processor_id()].num_events;
|
const struct per_cpu_arm_pmu *per_cpu_arm_pmu = get_per_cpu_pmu();
|
||||||
|
|
||||||
|
return per_cpu_arm_pmu->num_events;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ihk_mc_perfctr_set_extra(struct mc_perf_event *event)
|
int ihk_mc_perfctr_set_extra(struct mc_perf_event *event)
|
||||||
@@ -247,3 +215,83 @@ int ihk_mc_perfctr_set_extra(struct mc_perf_event *event)
|
|||||||
/* Nothing to do. */
|
/* Nothing to do. */
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline uint64_t arm_pmu_event_max_period(struct mc_perf_event *event)
|
||||||
|
{
|
||||||
|
return 0xFFFFFFFF;
|
||||||
|
}
|
||||||
|
|
||||||
|
int hw_perf_event_init(struct mc_perf_event *event)
|
||||||
|
{
|
||||||
|
struct hw_perf_event *hwc = &event->hw;
|
||||||
|
|
||||||
|
if (!is_sampling_event(event)) {
|
||||||
|
hwc->sample_period = arm_pmu_event_max_period(event) >> 1;
|
||||||
|
hwc->last_period = hwc->sample_period;
|
||||||
|
ihk_atomic64_set(&hwc->period_left, hwc->sample_period);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int ihk_mc_event_set_period(struct mc_perf_event *event)
|
||||||
|
{
|
||||||
|
struct hw_perf_event *hwc = &event->hw;
|
||||||
|
int64_t left = ihk_atomic64_read(&hwc->period_left);
|
||||||
|
int64_t period = hwc->sample_period;
|
||||||
|
uint64_t max_period;
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
|
max_period = arm_pmu_event_max_period(event);
|
||||||
|
if (unlikely(left <= -period)) {
|
||||||
|
left = period;
|
||||||
|
ihk_atomic64_set(&hwc->period_left, left);
|
||||||
|
hwc->last_period = period;
|
||||||
|
ret = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(left <= 0)) {
|
||||||
|
left += period;
|
||||||
|
ihk_atomic64_set(&hwc->period_left, left);
|
||||||
|
hwc->last_period = period;
|
||||||
|
ret = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Limit the maximum period to prevent the counter value
|
||||||
|
* from overtaking the one we are about to program. In
|
||||||
|
* effect we are reducing max_period to account for
|
||||||
|
* interrupt latency (and we are being very conservative).
|
||||||
|
*/
|
||||||
|
if (left > (max_period >> 1))
|
||||||
|
left = (max_period >> 1);
|
||||||
|
|
||||||
|
ihk_atomic64_set(&hwc->prev_count, (uint64_t)-left);
|
||||||
|
|
||||||
|
cpu_pmu.write_counter(event->counter_id,
|
||||||
|
(uint64_t)(-left) & max_period);
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t ihk_mc_event_update(struct mc_perf_event *event)
|
||||||
|
{
|
||||||
|
struct hw_perf_event *hwc = &event->hw;
|
||||||
|
int64_t delta;
|
||||||
|
uint64_t prev_raw_count, new_raw_count;
|
||||||
|
uint64_t max_period = arm_pmu_event_max_period(event);
|
||||||
|
|
||||||
|
again:
|
||||||
|
prev_raw_count = ihk_atomic64_read(&hwc->prev_count);
|
||||||
|
new_raw_count = cpu_pmu.read_counter(event->counter_id);
|
||||||
|
|
||||||
|
if (ihk_atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
|
||||||
|
new_raw_count) != prev_raw_count)
|
||||||
|
goto again;
|
||||||
|
|
||||||
|
delta = (new_raw_count - prev_raw_count) & max_period;
|
||||||
|
|
||||||
|
ihk_atomic64_add(delta, &event->count);
|
||||||
|
ihk_atomic64_add(-delta, &hwc->period_left);
|
||||||
|
|
||||||
|
return new_raw_count;
|
||||||
|
}
|
||||||
|
|||||||
@@ -4,7 +4,6 @@
|
|||||||
#include <ihk/perfctr.h>
|
#include <ihk/perfctr.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <ihk/debug.h>
|
#include <ihk/debug.h>
|
||||||
#include <debug.h>
|
|
||||||
#include <sysreg.h>
|
#include <sysreg.h>
|
||||||
#include <virt.h>
|
#include <virt.h>
|
||||||
#include <bitops.h>
|
#include <bitops.h>
|
||||||
@@ -21,29 +20,174 @@
|
|||||||
#define DDEBUG_DEFAULT DDEBUG_PRINT
|
#define DDEBUG_DEFAULT DDEBUG_PRINT
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* read pmevcntr<n>_el0 functions
|
||||||
|
*/
|
||||||
|
#define read_pmevcntrN_el0(N) \
|
||||||
|
static uint32_t read_pmevcntr##N##_el0(void) \
|
||||||
|
{ \
|
||||||
|
return read_sysreg(pmevcntr##N##_el0); \
|
||||||
|
}
|
||||||
|
|
||||||
|
read_pmevcntrN_el0(0)
|
||||||
|
read_pmevcntrN_el0(1)
|
||||||
|
read_pmevcntrN_el0(2)
|
||||||
|
read_pmevcntrN_el0(3)
|
||||||
|
read_pmevcntrN_el0(4)
|
||||||
|
read_pmevcntrN_el0(5)
|
||||||
|
read_pmevcntrN_el0(6)
|
||||||
|
read_pmevcntrN_el0(7)
|
||||||
|
read_pmevcntrN_el0(8)
|
||||||
|
read_pmevcntrN_el0(9)
|
||||||
|
read_pmevcntrN_el0(10)
|
||||||
|
read_pmevcntrN_el0(11)
|
||||||
|
read_pmevcntrN_el0(12)
|
||||||
|
read_pmevcntrN_el0(13)
|
||||||
|
read_pmevcntrN_el0(14)
|
||||||
|
read_pmevcntrN_el0(15)
|
||||||
|
read_pmevcntrN_el0(16)
|
||||||
|
read_pmevcntrN_el0(17)
|
||||||
|
read_pmevcntrN_el0(18)
|
||||||
|
read_pmevcntrN_el0(19)
|
||||||
|
read_pmevcntrN_el0(20)
|
||||||
|
read_pmevcntrN_el0(21)
|
||||||
|
read_pmevcntrN_el0(22)
|
||||||
|
read_pmevcntrN_el0(23)
|
||||||
|
read_pmevcntrN_el0(24)
|
||||||
|
read_pmevcntrN_el0(25)
|
||||||
|
read_pmevcntrN_el0(26)
|
||||||
|
read_pmevcntrN_el0(27)
|
||||||
|
read_pmevcntrN_el0(28)
|
||||||
|
read_pmevcntrN_el0(29)
|
||||||
|
read_pmevcntrN_el0(30)
|
||||||
|
|
||||||
|
static uint32_t (* const read_pmevcntr_el0[])(void) = {
|
||||||
|
read_pmevcntr0_el0, read_pmevcntr1_el0, read_pmevcntr2_el0,
|
||||||
|
read_pmevcntr3_el0, read_pmevcntr4_el0, read_pmevcntr5_el0,
|
||||||
|
read_pmevcntr6_el0, read_pmevcntr7_el0, read_pmevcntr8_el0,
|
||||||
|
read_pmevcntr9_el0, read_pmevcntr10_el0, read_pmevcntr11_el0,
|
||||||
|
read_pmevcntr12_el0, read_pmevcntr13_el0, read_pmevcntr14_el0,
|
||||||
|
read_pmevcntr15_el0, read_pmevcntr16_el0, read_pmevcntr17_el0,
|
||||||
|
read_pmevcntr18_el0, read_pmevcntr19_el0, read_pmevcntr20_el0,
|
||||||
|
read_pmevcntr21_el0, read_pmevcntr22_el0, read_pmevcntr23_el0,
|
||||||
|
read_pmevcntr24_el0, read_pmevcntr25_el0, read_pmevcntr26_el0,
|
||||||
|
read_pmevcntr27_el0, read_pmevcntr28_el0, read_pmevcntr29_el0,
|
||||||
|
read_pmevcntr30_el0,
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c
|
* write pmevcntr<n>_el0 functions
|
||||||
* Perf Events' indices
|
|
||||||
*/
|
*/
|
||||||
#define ARMV8_IDX_CYCLE_COUNTER 0
|
#define write_pmevcntrN_el0(N) \
|
||||||
#define ARMV8_IDX_COUNTER0 1
|
static void write_pmevcntr##N##_el0(uint32_t v) \
|
||||||
#define ARMV8_IDX_COUNTER_LAST (ARMV8_IDX_CYCLE_COUNTER + get_per_cpu_pmu()->num_events - 1)
|
{ \
|
||||||
|
write_sysreg(v, pmevcntr##N##_el0); \
|
||||||
|
}
|
||||||
|
|
||||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/include/asm/perf_event.h */
|
write_pmevcntrN_el0(0)
|
||||||
#define ARMV8_PMU_MAX_COUNTERS 32
|
write_pmevcntrN_el0(1)
|
||||||
#define ARMV8_PMU_COUNTER_MASK (ARMV8_PMU_MAX_COUNTERS - 1)
|
write_pmevcntrN_el0(2)
|
||||||
|
write_pmevcntrN_el0(3)
|
||||||
|
write_pmevcntrN_el0(4)
|
||||||
|
write_pmevcntrN_el0(5)
|
||||||
|
write_pmevcntrN_el0(6)
|
||||||
|
write_pmevcntrN_el0(7)
|
||||||
|
write_pmevcntrN_el0(8)
|
||||||
|
write_pmevcntrN_el0(9)
|
||||||
|
write_pmevcntrN_el0(10)
|
||||||
|
write_pmevcntrN_el0(11)
|
||||||
|
write_pmevcntrN_el0(12)
|
||||||
|
write_pmevcntrN_el0(13)
|
||||||
|
write_pmevcntrN_el0(14)
|
||||||
|
write_pmevcntrN_el0(15)
|
||||||
|
write_pmevcntrN_el0(16)
|
||||||
|
write_pmevcntrN_el0(17)
|
||||||
|
write_pmevcntrN_el0(18)
|
||||||
|
write_pmevcntrN_el0(19)
|
||||||
|
write_pmevcntrN_el0(20)
|
||||||
|
write_pmevcntrN_el0(21)
|
||||||
|
write_pmevcntrN_el0(22)
|
||||||
|
write_pmevcntrN_el0(23)
|
||||||
|
write_pmevcntrN_el0(24)
|
||||||
|
write_pmevcntrN_el0(25)
|
||||||
|
write_pmevcntrN_el0(26)
|
||||||
|
write_pmevcntrN_el0(27)
|
||||||
|
write_pmevcntrN_el0(28)
|
||||||
|
write_pmevcntrN_el0(29)
|
||||||
|
write_pmevcntrN_el0(30)
|
||||||
|
|
||||||
|
static void (* const write_pmevcntr_el0[])(uint32_t) = {
|
||||||
|
write_pmevcntr0_el0, write_pmevcntr1_el0, write_pmevcntr2_el0,
|
||||||
|
write_pmevcntr3_el0, write_pmevcntr4_el0, write_pmevcntr5_el0,
|
||||||
|
write_pmevcntr6_el0, write_pmevcntr7_el0, write_pmevcntr8_el0,
|
||||||
|
write_pmevcntr9_el0, write_pmevcntr10_el0, write_pmevcntr11_el0,
|
||||||
|
write_pmevcntr12_el0, write_pmevcntr13_el0, write_pmevcntr14_el0,
|
||||||
|
write_pmevcntr15_el0, write_pmevcntr16_el0, write_pmevcntr17_el0,
|
||||||
|
write_pmevcntr18_el0, write_pmevcntr19_el0, write_pmevcntr20_el0,
|
||||||
|
write_pmevcntr21_el0, write_pmevcntr22_el0, write_pmevcntr23_el0,
|
||||||
|
write_pmevcntr24_el0, write_pmevcntr25_el0, write_pmevcntr26_el0,
|
||||||
|
write_pmevcntr27_el0, write_pmevcntr28_el0, write_pmevcntr29_el0,
|
||||||
|
write_pmevcntr30_el0,
|
||||||
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* ARMv8 low level PMU access
|
* write pmevtyper<n>_el0 functions
|
||||||
*/
|
*/
|
||||||
|
#define write_pmevtyperN_el0(N) \
|
||||||
|
static void write_pmevtyper##N##_el0(uint32_t v) \
|
||||||
|
{ \
|
||||||
|
write_sysreg(v, pmevtyper##N##_el0); \
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
write_pmevtyperN_el0(0)
|
||||||
* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c
|
write_pmevtyperN_el0(1)
|
||||||
* Perf Event to low level counters mapping
|
write_pmevtyperN_el0(2)
|
||||||
*/
|
write_pmevtyperN_el0(3)
|
||||||
#define ARMV8_IDX_TO_COUNTER(x) \
|
write_pmevtyperN_el0(4)
|
||||||
(((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK)
|
write_pmevtyperN_el0(5)
|
||||||
|
write_pmevtyperN_el0(6)
|
||||||
|
write_pmevtyperN_el0(7)
|
||||||
|
write_pmevtyperN_el0(8)
|
||||||
|
write_pmevtyperN_el0(9)
|
||||||
|
write_pmevtyperN_el0(10)
|
||||||
|
write_pmevtyperN_el0(11)
|
||||||
|
write_pmevtyperN_el0(12)
|
||||||
|
write_pmevtyperN_el0(13)
|
||||||
|
write_pmevtyperN_el0(14)
|
||||||
|
write_pmevtyperN_el0(15)
|
||||||
|
write_pmevtyperN_el0(16)
|
||||||
|
write_pmevtyperN_el0(17)
|
||||||
|
write_pmevtyperN_el0(18)
|
||||||
|
write_pmevtyperN_el0(19)
|
||||||
|
write_pmevtyperN_el0(20)
|
||||||
|
write_pmevtyperN_el0(21)
|
||||||
|
write_pmevtyperN_el0(22)
|
||||||
|
write_pmevtyperN_el0(23)
|
||||||
|
write_pmevtyperN_el0(24)
|
||||||
|
write_pmevtyperN_el0(25)
|
||||||
|
write_pmevtyperN_el0(26)
|
||||||
|
write_pmevtyperN_el0(27)
|
||||||
|
write_pmevtyperN_el0(28)
|
||||||
|
write_pmevtyperN_el0(29)
|
||||||
|
write_pmevtyperN_el0(30)
|
||||||
|
|
||||||
|
static void (* const write_pmevtyper_el0[])(uint32_t) = {
|
||||||
|
write_pmevtyper0_el0, write_pmevtyper1_el0, write_pmevtyper2_el0,
|
||||||
|
write_pmevtyper3_el0, write_pmevtyper4_el0, write_pmevtyper5_el0,
|
||||||
|
write_pmevtyper6_el0, write_pmevtyper7_el0, write_pmevtyper8_el0,
|
||||||
|
write_pmevtyper9_el0, write_pmevtyper10_el0, write_pmevtyper11_el0,
|
||||||
|
write_pmevtyper12_el0, write_pmevtyper13_el0, write_pmevtyper14_el0,
|
||||||
|
write_pmevtyper15_el0, write_pmevtyper16_el0, write_pmevtyper17_el0,
|
||||||
|
write_pmevtyper18_el0, write_pmevtyper19_el0, write_pmevtyper20_el0,
|
||||||
|
write_pmevtyper21_el0, write_pmevtyper22_el0, write_pmevtyper23_el0,
|
||||||
|
write_pmevtyper24_el0, write_pmevtyper25_el0, write_pmevtyper26_el0,
|
||||||
|
write_pmevtyper27_el0, write_pmevtyper28_el0, write_pmevtyper29_el0,
|
||||||
|
write_pmevtyper30_el0,
|
||||||
|
};
|
||||||
|
|
||||||
|
#define ARMV8_IDX_CYCLE_COUNTER 31
|
||||||
|
#define ARMV8_IDX_COUNTER0 0
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* @ref.impl linux-v4.15-rc3 arch/arm64/include/asm/perf_event.h
|
* @ref.impl linux-v4.15-rc3 arch/arm64/include/asm/perf_event.h
|
||||||
@@ -175,6 +319,10 @@
|
|||||||
|
|
||||||
/* PMUv3 HW events mapping. */
|
/* PMUv3 HW events mapping. */
|
||||||
|
|
||||||
|
/* disable -Woverride-init for the following initializations */
|
||||||
|
#pragma GCC diagnostic push
|
||||||
|
#pragma GCC diagnostic ignored "-Woverride-init"
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c
|
* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c
|
||||||
* ARMv8 Architectural defined events, not all of these may
|
* ARMv8 Architectural defined events, not all of these may
|
||||||
@@ -220,6 +368,9 @@ static const unsigned armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
|
|||||||
[C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED,
|
[C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* restore warnings */
|
||||||
|
#pragma GCC diagnostic pop
|
||||||
|
|
||||||
/* @ref.impl linux-v4.15-rc3 drivers/perf/arm_pmu.c */
|
/* @ref.impl linux-v4.15-rc3 drivers/perf/arm_pmu.c */
|
||||||
static int
|
static int
|
||||||
armpmu_map_cache_event(const unsigned (*cache_map)
|
armpmu_map_cache_event(const unsigned (*cache_map)
|
||||||
@@ -298,11 +449,25 @@ armpmu_map_event(uint32_t type, uint64_t config,
|
|||||||
return -ENOENT;
|
return -ENOENT;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
static inline int armv8pmu_counter_mask_valid(unsigned long counter_mask)
|
||||||
|
{
|
||||||
|
int num;
|
||||||
|
unsigned long event;
|
||||||
|
unsigned long cycle;
|
||||||
|
unsigned long invalid_mask;
|
||||||
|
|
||||||
|
num = get_per_cpu_pmu()->num_events;
|
||||||
|
num--; /* Sub the CPU cycles counter */
|
||||||
|
event = ((1UL << num) - 1) << ARMV8_IDX_COUNTER0;
|
||||||
|
cycle = 1UL << ARMV8_IDX_CYCLE_COUNTER;
|
||||||
|
invalid_mask = ~(event | cycle);
|
||||||
|
|
||||||
|
return !(counter_mask & invalid_mask);
|
||||||
|
}
|
||||||
|
|
||||||
static inline int armv8pmu_counter_valid(int idx)
|
static inline int armv8pmu_counter_valid(int idx)
|
||||||
{
|
{
|
||||||
return idx >= ARMV8_IDX_CYCLE_COUNTER &&
|
return armv8pmu_counter_mask_valid(1UL << idx);
|
||||||
idx <= ARMV8_IDX_COUNTER_LAST;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||||
@@ -326,6 +491,11 @@ static inline int armv8pmu_has_overflowed(uint32_t pmovsr)
|
|||||||
return pmovsr & ARMV8_PMU_OVERFLOWED_MASK;
|
return pmovsr & ARMV8_PMU_OVERFLOWED_MASK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline int armv8pmu_counter_has_overflowed(uint32_t pmnc, int idx)
|
||||||
|
{
|
||||||
|
return pmnc & BIT(idx);
|
||||||
|
}
|
||||||
|
|
||||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||||
static int __armv8_pmuv3_map_event(uint32_t type, uint64_t config,
|
static int __armv8_pmuv3_map_event(uint32_t type, uint64_t config,
|
||||||
const unsigned int (*extra_event_map)
|
const unsigned int (*extra_event_map)
|
||||||
@@ -357,6 +527,23 @@ static int armv8_pmuv3_map_event(uint32_t type, uint64_t config)
|
|||||||
return __armv8_pmuv3_map_event(type, config, NULL, NULL);
|
return __armv8_pmuv3_map_event(type, config, NULL, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static int armv8_pmuv3_map_hw_event(uint64_t config)
|
||||||
|
{
|
||||||
|
return __armv8_pmuv3_map_event(PERF_TYPE_HARDWARE, config, NULL, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static int armv8_pmuv3_map_cache_event(uint64_t config)
|
||||||
|
{
|
||||||
|
return __armv8_pmuv3_map_event(PERF_TYPE_HW_CACHE, config, NULL, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int armv8_pmuv3_map_raw_event(uint64_t config)
|
||||||
|
{
|
||||||
|
return __armv8_pmuv3_map_event(PERF_TYPE_RAW, config, NULL, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||||
static inline uint32_t armv8pmu_pmcr_read(void)
|
static inline uint32_t armv8pmu_pmcr_read(void)
|
||||||
{
|
{
|
||||||
@@ -371,24 +558,6 @@ static inline void armv8pmu_pmcr_write(uint32_t val)
|
|||||||
write_sysreg(val, pmcr_el0);
|
write_sysreg(val, pmcr_el0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
|
||||||
static inline int armv8pmu_select_counter(int idx)
|
|
||||||
{
|
|
||||||
uint32_t counter;
|
|
||||||
|
|
||||||
if (!armv8pmu_counter_valid(idx)) {
|
|
||||||
ekprintf("%s: The count_register#%d is not implemented.\n",
|
|
||||||
__func__, idx);
|
|
||||||
return -EINVAL;
|
|
||||||
}
|
|
||||||
|
|
||||||
counter = ARMV8_IDX_TO_COUNTER(idx);
|
|
||||||
write_sysreg(counter, pmselr_el0);
|
|
||||||
isb();
|
|
||||||
|
|
||||||
return idx;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||||
static inline uint32_t armv8pmu_read_counter(int idx)
|
static inline uint32_t armv8pmu_read_counter(int idx)
|
||||||
{
|
{
|
||||||
@@ -401,8 +570,8 @@ static inline uint32_t armv8pmu_read_counter(int idx)
|
|||||||
else if (idx == ARMV8_IDX_CYCLE_COUNTER) {
|
else if (idx == ARMV8_IDX_CYCLE_COUNTER) {
|
||||||
value = read_sysreg(pmccntr_el0);
|
value = read_sysreg(pmccntr_el0);
|
||||||
}
|
}
|
||||||
else if (armv8pmu_select_counter(idx) == idx) {
|
else {
|
||||||
value = read_sysreg(pmxevcntr_el0);
|
value = read_pmevcntr_el0[idx]();
|
||||||
}
|
}
|
||||||
|
|
||||||
return value;
|
return value;
|
||||||
@@ -425,39 +594,38 @@ static inline void armv8pmu_write_counter(int idx, uint32_t value)
|
|||||||
|
|
||||||
write_sysreg(value64, pmccntr_el0);
|
write_sysreg(value64, pmccntr_el0);
|
||||||
}
|
}
|
||||||
else if (armv8pmu_select_counter(idx) == idx) {
|
else {
|
||||||
write_sysreg(value, pmxevcntr_el0);
|
write_pmevcntr_el0[idx](value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||||
static inline int armv8pmu_enable_intens(int idx)
|
static inline int armv8pmu_enable_intens(unsigned long counter_mask)
|
||||||
{
|
{
|
||||||
uint32_t counter;
|
if (!armv8pmu_counter_mask_valid(counter_mask)) {
|
||||||
|
ekprintf("%s: invalid counter mask(%#lx)\n",
|
||||||
if (!armv8pmu_counter_valid(idx)) {
|
__func__, counter_mask);
|
||||||
ekprintf("%s: The count_register#%d is not implemented.\n",
|
|
||||||
__func__, idx);
|
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
counter = ARMV8_IDX_TO_COUNTER(idx);
|
write_sysreg(counter_mask, pmintenset_el1);
|
||||||
write_sysreg(BIT(counter), pmintenset_el1);
|
return 0;
|
||||||
return idx;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||||
static inline int armv8pmu_disable_intens(int idx)
|
static inline int armv8pmu_disable_intens(unsigned long counter_mask)
|
||||||
{
|
{
|
||||||
uint32_t counter = ARMV8_IDX_TO_COUNTER(idx);
|
if (!armv8pmu_counter_mask_valid(counter_mask)) {
|
||||||
|
ekprintf("%s: invalid counter mask(%#lx)\n",
|
||||||
write_sysreg(BIT(counter), pmintenclr_el1);
|
__func__, counter_mask);
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
write_sysreg(counter_mask, pmintenclr_el1);
|
||||||
isb();
|
isb();
|
||||||
/* Clear the overflow flag in case an interrupt is pending. */
|
/* Clear the overflow flag in case an interrupt is pending. */
|
||||||
write_sysreg(BIT(counter), pmovsclr_el0);
|
write_sysreg(counter_mask, pmovsclr_el0);
|
||||||
isb();
|
isb();
|
||||||
|
return 0;
|
||||||
return idx;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||||
@@ -492,42 +660,37 @@ static int armv8pmu_set_event_filter(unsigned long *config_base, int mode)
|
|||||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||||
static inline void armv8pmu_write_evtype(int idx, uint32_t val)
|
static inline void armv8pmu_write_evtype(int idx, uint32_t val)
|
||||||
{
|
{
|
||||||
if (armv8pmu_select_counter(idx) == idx) {
|
if (!armv8pmu_counter_valid(idx)) {
|
||||||
val &= ARMV8_PMU_EVTYPE_MASK;
|
ekprintf("%s: The count_register#%d is not implemented.\n",
|
||||||
write_sysreg(val, pmxevtyper_el0);
|
__func__, idx);
|
||||||
|
return;
|
||||||
|
} else if (idx != ARMV8_IDX_CYCLE_COUNTER) {
|
||||||
|
write_pmevtyper_el0[idx](val);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||||
static inline int armv8pmu_enable_counter(int idx)
|
static inline int armv8pmu_enable_counter(unsigned long counter_mask)
|
||||||
{
|
{
|
||||||
uint32_t counter;
|
if (!armv8pmu_counter_mask_valid(counter_mask)) {
|
||||||
|
ekprintf("%s: invalid counter mask 0x%lx.\n",
|
||||||
if (!armv8pmu_counter_valid(idx)) {
|
__func__, counter_mask);
|
||||||
ekprintf("%s: The count_register#%d is not implemented.\n",
|
|
||||||
__func__, idx);
|
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
write_sysreg(counter_mask, pmcntenset_el0);
|
||||||
counter = ARMV8_IDX_TO_COUNTER(idx);
|
return 0;
|
||||||
write_sysreg(BIT(counter), pmcntenset_el0);
|
|
||||||
return idx;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||||
static inline int armv8pmu_disable_counter(int idx)
|
static inline int armv8pmu_disable_counter(unsigned long counter_mask)
|
||||||
{
|
{
|
||||||
uint32_t counter;
|
if (!armv8pmu_counter_mask_valid(counter_mask)) {
|
||||||
|
ekprintf("%s: invalid counter mask 0x%lx.\n",
|
||||||
if (!armv8pmu_counter_valid(idx)) {
|
__func__, counter_mask);
|
||||||
ekprintf("%s: The count_register#%d is not implemented.\n",
|
|
||||||
__func__, idx);
|
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
write_sysreg(counter_mask, pmcntenclr_el0);
|
||||||
counter = ARMV8_IDX_TO_COUNTER(idx);
|
return 0;
|
||||||
write_sysreg(BIT(counter), pmcntenclr_el0);
|
|
||||||
return idx;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||||
@@ -555,41 +718,20 @@ static void armv8pmu_stop(void)
|
|||||||
ihk_mc_spinlock_unlock(&pmu_lock, flags);
|
ihk_mc_spinlock_unlock(&pmu_lock, flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
|
||||||
static void armv8pmu_disable_event(int idx)
|
|
||||||
{
|
|
||||||
unsigned long flags;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Disable counter and interrupt
|
|
||||||
*/
|
|
||||||
flags = ihk_mc_spinlock_lock(&pmu_lock);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Disable counter
|
|
||||||
*/
|
|
||||||
armv8pmu_disable_counter(idx);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Disable interrupt for this counter
|
|
||||||
*/
|
|
||||||
armv8pmu_disable_intens(idx);
|
|
||||||
|
|
||||||
ihk_mc_spinlock_unlock(&pmu_lock, flags);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||||
static void armv8pmu_reset(void *info)
|
static void armv8pmu_reset(void *info)
|
||||||
{
|
{
|
||||||
struct arm_pmu *cpu_pmu = (struct arm_pmu *)info;
|
struct arm_pmu *cpu_pmu = (struct arm_pmu *)info;
|
||||||
uint32_t idx, nb_cnt =
|
uint32_t nb_cnt =
|
||||||
cpu_pmu->per_cpu[ihk_mc_get_processor_id()].num_events;
|
cpu_pmu->per_cpu[ihk_mc_get_processor_id()].num_events;
|
||||||
|
nb_cnt--; /* Sub the CPU cycles counter */
|
||||||
|
unsigned long event = ((1UL << nb_cnt) - 1) << ARMV8_IDX_COUNTER0;
|
||||||
|
unsigned long cycle = 1UL << ARMV8_IDX_CYCLE_COUNTER;
|
||||||
|
unsigned long valid_mask = event | cycle;
|
||||||
|
|
||||||
/* The counter and interrupt enable registers are unknown at reset. */
|
/* The counter and interrupt enable registers are unknown at reset. */
|
||||||
for (idx = ARMV8_IDX_CYCLE_COUNTER; idx < nb_cnt; ++idx) {
|
armv8pmu_disable_counter(valid_mask);
|
||||||
armv8pmu_disable_counter(idx);
|
armv8pmu_disable_intens(valid_mask);
|
||||||
armv8pmu_disable_intens(idx);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Initialize & Reset PMNC. Request overflow interrupt for
|
* Initialize & Reset PMNC. Request overflow interrupt for
|
||||||
@@ -603,7 +745,7 @@ static void armv8pmu_reset(void *info)
|
|||||||
static int armv8pmu_get_event_idx(int num_events, unsigned long used_mask,
|
static int armv8pmu_get_event_idx(int num_events, unsigned long used_mask,
|
||||||
unsigned long config)
|
unsigned long config)
|
||||||
{
|
{
|
||||||
int idx;
|
int idx, end;
|
||||||
unsigned long evtype = config & ARMV8_PMU_EVTYPE_EVENT;
|
unsigned long evtype = config & ARMV8_PMU_EVTYPE_EVENT;
|
||||||
|
|
||||||
/* Always prefer to place a cycle counter into the cycle counter. */
|
/* Always prefer to place a cycle counter into the cycle counter. */
|
||||||
@@ -615,7 +757,9 @@ static int armv8pmu_get_event_idx(int num_events, unsigned long used_mask,
|
|||||||
/*
|
/*
|
||||||
* Otherwise use events counters
|
* Otherwise use events counters
|
||||||
*/
|
*/
|
||||||
for (idx = ARMV8_IDX_COUNTER0; idx < num_events; ++idx) {
|
end = ARMV8_IDX_COUNTER0 + num_events;
|
||||||
|
end--; /* Sub the CPU cycles counter */
|
||||||
|
for (idx = ARMV8_IDX_COUNTER0; idx < end; ++idx) {
|
||||||
if (!(used_mask & (1UL << idx)))
|
if (!(used_mask & (1UL << idx)))
|
||||||
return idx;
|
return idx;
|
||||||
}
|
}
|
||||||
@@ -642,13 +786,11 @@ static uint32_t armv8pmu_read_num_pmnc_events(void)
|
|||||||
|
|
||||||
static void armv8pmu_handle_irq(void *priv)
|
static void armv8pmu_handle_irq(void *priv)
|
||||||
{
|
{
|
||||||
struct siginfo info;
|
|
||||||
uint32_t pmovsr;
|
uint32_t pmovsr;
|
||||||
struct thread *thread = cpu_local_var(current);
|
struct thread *thread = cpu_local_var(current);
|
||||||
struct process *proc = thread->proc;
|
struct process *proc = thread->proc;
|
||||||
long irqstate;
|
const struct per_cpu_arm_pmu *cpu_pmu = get_per_cpu_pmu();
|
||||||
struct mckfd *fdp;
|
int idx;
|
||||||
struct pt_regs *regs = (struct pt_regs *)priv;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Get and reset the IRQ flags
|
* Get and reset the IRQ flags
|
||||||
@@ -661,27 +803,40 @@ static void armv8pmu_handle_irq(void *priv)
|
|||||||
if (!armv8pmu_has_overflowed(pmovsr))
|
if (!armv8pmu_has_overflowed(pmovsr))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
if (!proc->monitoring_event) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
/*
|
/*
|
||||||
* Handle the counter(s) overflow(s)
|
* Handle the counter(s) overflow(s)
|
||||||
*/
|
*/
|
||||||
/* same as x86_64 mckernel */
|
for (idx = 0; idx < cpu_pmu->num_events; idx++) {
|
||||||
irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
|
struct mc_perf_event *event = NULL;
|
||||||
for (fdp = proc->mckfd; fdp; fdp = fdp->next) {
|
struct mc_perf_event *sub;
|
||||||
if (fdp->sig_no > 0)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
|
|
||||||
|
|
||||||
if (fdp) {
|
if (!armv8pmu_counter_has_overflowed(pmovsr, idx)) {
|
||||||
memset(&info, '\0', sizeof(info));
|
continue;
|
||||||
info.si_signo = fdp->sig_no;
|
}
|
||||||
info._sifields._sigfault.si_addr = (void *)regs->pc;
|
|
||||||
info._sifields._sigpoll.si_fd = fdp->fd;
|
if (proc->monitoring_event->counter_id == idx) {
|
||||||
set_signal(fdp->sig_no, regs, &info);
|
event = proc->monitoring_event;
|
||||||
}
|
} else {
|
||||||
else {
|
list_for_each_entry(sub,
|
||||||
set_signal(SIGIO, regs, NULL);
|
&proc->monitoring_event->sibling_list,
|
||||||
|
group_entry) {
|
||||||
|
if (sub->counter_id == idx) {
|
||||||
|
event = sub;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!event) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
ihk_mc_event_update(event);
|
||||||
|
ihk_mc_event_set_period(event);
|
||||||
}
|
}
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void armv8pmu_enable_user_access_pmu_regs(void)
|
static void armv8pmu_enable_user_access_pmu_regs(void)
|
||||||
@@ -735,11 +890,15 @@ int armv8pmu_init(struct arm_pmu* cpu_pmu)
|
|||||||
cpu_pmu->write_evtype = armv8pmu_write_evtype;
|
cpu_pmu->write_evtype = armv8pmu_write_evtype;
|
||||||
cpu_pmu->get_event_idx = armv8pmu_get_event_idx;
|
cpu_pmu->get_event_idx = armv8pmu_get_event_idx;
|
||||||
cpu_pmu->map_event = armv8_pmuv3_map_event;
|
cpu_pmu->map_event = armv8_pmuv3_map_event;
|
||||||
|
cpu_pmu->map_hw_event = armv8_pmuv3_map_hw_event;
|
||||||
|
cpu_pmu->map_cache_event = armv8_pmuv3_map_cache_event;
|
||||||
|
cpu_pmu->map_raw_event = armv8_pmuv3_map_raw_event;
|
||||||
cpu_pmu->enable_user_access_pmu_regs =
|
cpu_pmu->enable_user_access_pmu_regs =
|
||||||
armv8pmu_enable_user_access_pmu_regs;
|
armv8pmu_enable_user_access_pmu_regs;
|
||||||
cpu_pmu->disable_user_access_pmu_regs =
|
cpu_pmu->disable_user_access_pmu_regs =
|
||||||
armv8pmu_disable_user_access_pmu_regs;
|
armv8pmu_disable_user_access_pmu_regs;
|
||||||
cpu_pmu->handler = &armv8pmu_handler;
|
cpu_pmu->handler = &armv8pmu_handler;
|
||||||
|
cpu_pmu->counter_mask_valid = &armv8pmu_counter_mask_valid;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -18,10 +18,9 @@
|
|||||||
#include <psci.h>
|
#include <psci.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <ihk/types.h>
|
#include <ihk/types.h>
|
||||||
#include <ihk/debug.h>
|
|
||||||
#include <compiler.h>
|
#include <compiler.h>
|
||||||
#include <lwk/compiler.h>
|
#include <lwk/compiler.h>
|
||||||
#include <debug.h>
|
#include <ihk/debug.h>
|
||||||
|
|
||||||
//#define DEBUG_PRINT_PSCI
|
//#define DEBUG_PRINT_PSCI
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
/* ptrace.c COPYRIGHT FUJITSU LIMITED 2016-2018 */
|
/* ptrace.c COPYRIGHT FUJITSU LIMITED 2016-2019 */
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <debug-monitors.h>
|
#include <debug-monitors.h>
|
||||||
#include <hw_breakpoint.h>
|
#include <hw_breakpoint.h>
|
||||||
@@ -11,7 +11,8 @@
|
|||||||
#include <hwcap.h>
|
#include <hwcap.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <thread_info.h>
|
#include <thread_info.h>
|
||||||
#include <debug.h>
|
#include <ptrace.h>
|
||||||
|
#include <ihk/debug.h>
|
||||||
|
|
||||||
//#define DEBUG_PRINT_SC
|
//#define DEBUG_PRINT_SC
|
||||||
|
|
||||||
@@ -25,37 +26,6 @@
|
|||||||
extern void save_debugreg(unsigned long *debugreg);
|
extern void save_debugreg(unsigned long *debugreg);
|
||||||
extern int interrupt_from_user(void *);
|
extern int interrupt_from_user(void *);
|
||||||
|
|
||||||
enum aarch64_regset {
|
|
||||||
REGSET_GPR,
|
|
||||||
REGSET_FPR,
|
|
||||||
REGSET_TLS,
|
|
||||||
REGSET_HW_BREAK,
|
|
||||||
REGSET_HW_WATCH,
|
|
||||||
REGSET_SYSTEM_CALL,
|
|
||||||
#ifdef CONFIG_ARM64_SVE
|
|
||||||
REGSET_SVE,
|
|
||||||
#endif /* CONFIG_ARM64_SVE */
|
|
||||||
};
|
|
||||||
|
|
||||||
struct user_regset;
|
|
||||||
typedef long user_regset_get_fn(struct thread *target,
|
|
||||||
const struct user_regset *regset,
|
|
||||||
unsigned int pos, unsigned int count,
|
|
||||||
void *kbuf, void __user *ubuf);
|
|
||||||
|
|
||||||
typedef long user_regset_set_fn(struct thread *target,
|
|
||||||
const struct user_regset *regset,
|
|
||||||
unsigned int pos, unsigned int count,
|
|
||||||
const void *kbuf, const void __user *ubuf);
|
|
||||||
|
|
||||||
struct user_regset {
|
|
||||||
user_regset_get_fn *get;
|
|
||||||
user_regset_set_fn *set;
|
|
||||||
unsigned int n;
|
|
||||||
unsigned int size;
|
|
||||||
unsigned int core_note_type;
|
|
||||||
};
|
|
||||||
|
|
||||||
long ptrace_read_user(struct thread *thread, long addr, unsigned long *value)
|
long ptrace_read_user(struct thread *thread, long addr, unsigned long *value)
|
||||||
{
|
{
|
||||||
return -EIO;
|
return -EIO;
|
||||||
@@ -273,6 +243,17 @@ static inline long copy_regset_from_user(struct thread *target,
|
|||||||
return regset->set(target, regset, offset, size, NULL, data);
|
return regset->set(target, regset, offset, size, NULL, data);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsigned int regset_size(struct thread *target,
|
||||||
|
const struct user_regset *regset)
|
||||||
|
{
|
||||||
|
if (!regset->get_size) {
|
||||||
|
return regset->n * regset->size;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return regset->get_size(target, regset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Bits which are always architecturally RES0 per ARM DDI 0487A.h
|
* Bits which are always architecturally RES0 per ARM DDI 0487A.h
|
||||||
* Userspace cannot use these until they have an architectural meaning.
|
* Userspace cannot use these until they have an architectural meaning.
|
||||||
@@ -624,6 +605,48 @@ out:
|
|||||||
|
|
||||||
#ifdef CONFIG_ARM64_SVE
|
#ifdef CONFIG_ARM64_SVE
|
||||||
|
|
||||||
|
static void sve_init_header_from_thread(struct user_sve_header *header,
|
||||||
|
struct thread *target)
|
||||||
|
{
|
||||||
|
unsigned int vq;
|
||||||
|
|
||||||
|
memset(header, 0, sizeof(*header));
|
||||||
|
|
||||||
|
/* McKernel processes always enable SVE. */
|
||||||
|
header->flags = SVE_PT_REGS_SVE;
|
||||||
|
|
||||||
|
if (target->ctx.thread->sve_flags & SVE_PT_VL_INHERIT) {
|
||||||
|
header->flags |= SVE_PT_VL_INHERIT;
|
||||||
|
}
|
||||||
|
|
||||||
|
header->vl = target->ctx.thread->sve_vl;
|
||||||
|
vq = sve_vq_from_vl(header->vl);
|
||||||
|
|
||||||
|
header->max_vl = sve_max_vl;
|
||||||
|
header->size = SVE_PT_SIZE(vq, header->flags);
|
||||||
|
header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl),
|
||||||
|
SVE_PT_REGS_SVE);
|
||||||
|
}
|
||||||
|
|
||||||
|
static unsigned int sve_size_from_header(struct user_sve_header const *header)
|
||||||
|
{
|
||||||
|
return ALIGN(header->size, SVE_VQ_BYTES);
|
||||||
|
}
|
||||||
|
|
||||||
|
static unsigned int sve_get_size(struct thread *target,
|
||||||
|
const struct user_regset *regset)
|
||||||
|
{
|
||||||
|
struct user_sve_header header;
|
||||||
|
|
||||||
|
/* Instead of system_supports_sve() */
|
||||||
|
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
sve_init_header_from_thread(&header, target);
|
||||||
|
return sve_size_from_header(&header);
|
||||||
|
}
|
||||||
|
|
||||||
/* read NT_ARM_SVE */
|
/* read NT_ARM_SVE */
|
||||||
static long sve_get(struct thread *target,
|
static long sve_get(struct thread *target,
|
||||||
const struct user_regset *regset,
|
const struct user_regset *regset,
|
||||||
@@ -646,23 +669,9 @@ static long sve_get(struct thread *target,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Header */
|
/* Header */
|
||||||
memset(&header, 0, sizeof(header));
|
sve_init_header_from_thread(&header, target);
|
||||||
|
|
||||||
header.vl = target->ctx.thread->sve_vl;
|
|
||||||
|
|
||||||
BUG_ON(!sve_vl_valid(header.vl));
|
|
||||||
vq = sve_vq_from_vl(header.vl);
|
vq = sve_vq_from_vl(header.vl);
|
||||||
|
|
||||||
BUG_ON(!sve_vl_valid(sve_max_vl));
|
|
||||||
header.max_vl = sve_max_vl;
|
|
||||||
|
|
||||||
/* McKernel processes always enable SVE. */
|
|
||||||
header.flags = SVE_PT_REGS_SVE;
|
|
||||||
|
|
||||||
header.size = SVE_PT_SIZE(vq, header.flags);
|
|
||||||
header.max_size = SVE_PT_SIZE(sve_vq_from_vl(header.max_vl),
|
|
||||||
SVE_PT_REGS_SVE);
|
|
||||||
|
|
||||||
ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &header,
|
ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &header,
|
||||||
0, sizeof(header));
|
0, sizeof(header));
|
||||||
if (ret) {
|
if (ret) {
|
||||||
@@ -676,11 +685,9 @@ static long sve_get(struct thread *target,
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
/* Otherwise: full SVE case */
|
/* Otherwise: full SVE case */
|
||||||
|
|
||||||
start = SVE_PT_SVE_OFFSET;
|
start = SVE_PT_SVE_OFFSET;
|
||||||
end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq);
|
end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq);
|
||||||
|
|
||||||
BUG_ON(end < start);
|
|
||||||
BUG_ON(end - start > sve_state_size(target));
|
|
||||||
ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
|
ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
|
||||||
target->ctx.thread->sve_state,
|
target->ctx.thread->sve_state,
|
||||||
start, end);
|
start, end);
|
||||||
@@ -690,24 +697,18 @@ static long sve_get(struct thread *target,
|
|||||||
|
|
||||||
start = end;
|
start = end;
|
||||||
end = SVE_PT_SVE_FPSR_OFFSET(vq);
|
end = SVE_PT_SVE_FPSR_OFFSET(vq);
|
||||||
|
|
||||||
BUG_ON(end < start);
|
|
||||||
ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
|
ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
|
||||||
start, end);
|
start, end);
|
||||||
if (ret) {
|
if (ret) {
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copy fpsr, and fpcr which must follow contiguously in
|
||||||
|
* struct fpsimd_state:
|
||||||
|
*/
|
||||||
start = end;
|
start = end;
|
||||||
end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE;
|
end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE;
|
||||||
|
|
||||||
BUG_ON((char *)(&target->fp_regs->fpcr + 1) <
|
|
||||||
(char *)&target->fp_regs->fpsr);
|
|
||||||
BUG_ON(end < start);
|
|
||||||
BUG_ON((char *)(&target->fp_regs->fpcr + 1) -
|
|
||||||
(char *)&target->fp_regs->fpsr !=
|
|
||||||
end - start);
|
|
||||||
|
|
||||||
ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
|
ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
|
||||||
&target->fp_regs->fpsr,
|
&target->fp_regs->fpsr,
|
||||||
start, end);
|
start, end);
|
||||||
@@ -716,9 +717,7 @@ static long sve_get(struct thread *target,
|
|||||||
}
|
}
|
||||||
|
|
||||||
start = end;
|
start = end;
|
||||||
end = (SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE) + 15) / 16 * 16;
|
end = sve_size_from_header(&header);
|
||||||
|
|
||||||
BUG_ON(end < start);
|
|
||||||
ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
|
ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
|
||||||
start, end);
|
start, end);
|
||||||
out:
|
out:
|
||||||
@@ -762,13 +761,12 @@ static long sve_set(struct thread *target,
|
|||||||
* sve_set_vector_length(), which will also validate them for us:
|
* sve_set_vector_length(), which will also validate them for us:
|
||||||
*/
|
*/
|
||||||
ret = sve_set_vector_length(target, header.vl,
|
ret = sve_set_vector_length(target, header.vl,
|
||||||
header.flags & ~SVE_PT_REGS_MASK);
|
((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16);
|
||||||
if (ret) {
|
if (ret) {
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Actual VL set may be less than the user asked for: */
|
/* Actual VL set may be less than the user asked for: */
|
||||||
BUG_ON(!sve_vl_valid(target->ctx.thread->sve_vl));
|
|
||||||
vq = sve_vq_from_vl(target->ctx.thread->sve_vl);
|
vq = sve_vq_from_vl(target->ctx.thread->sve_vl);
|
||||||
|
|
||||||
/* Registers: FPSIMD-only case */
|
/* Registers: FPSIMD-only case */
|
||||||
@@ -779,11 +777,19 @@ static long sve_set(struct thread *target,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Otherwise: full SVE case */
|
/* Otherwise: full SVE case */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If setting a different VL from the requested VL and there is
|
||||||
|
* register data, the data layout will be wrong: don't even
|
||||||
|
* try to set the registers in this case.
|
||||||
|
*/
|
||||||
|
if (count && vq != sve_vq_from_vl(header.vl)) {
|
||||||
|
ret = -EIO;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
start = SVE_PT_SVE_OFFSET;
|
start = SVE_PT_SVE_OFFSET;
|
||||||
end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq);
|
end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq);
|
||||||
|
|
||||||
BUG_ON(end < start);
|
|
||||||
BUG_ON(end - start > sve_state_size(target));
|
|
||||||
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
|
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
|
||||||
target->ctx.thread->sve_state,
|
target->ctx.thread->sve_state,
|
||||||
start, end);
|
start, end);
|
||||||
@@ -793,27 +799,21 @@ static long sve_set(struct thread *target,
|
|||||||
|
|
||||||
start = end;
|
start = end;
|
||||||
end = SVE_PT_SVE_FPSR_OFFSET(vq);
|
end = SVE_PT_SVE_FPSR_OFFSET(vq);
|
||||||
|
|
||||||
BUG_ON(end < start);
|
|
||||||
ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
|
ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
|
||||||
start, end);
|
start, end);
|
||||||
if (ret) {
|
if (ret) {
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copy fpsr, and fpcr which must follow contiguously in
|
||||||
|
* struct fpsimd_state:
|
||||||
|
*/
|
||||||
start = end;
|
start = end;
|
||||||
end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE;
|
end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE;
|
||||||
|
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
|
||||||
BUG_ON((char *)(&target->fp_regs->fpcr + 1) <
|
&target->fp_regs->fpsr,
|
||||||
(char *)&target->fp_regs->fpsr);
|
start, end);
|
||||||
BUG_ON(end < start);
|
|
||||||
BUG_ON((char *)(&target->fp_regs->fpcr + 1) -
|
|
||||||
(char *)&target->fp_regs->fpsr !=
|
|
||||||
end - start);
|
|
||||||
|
|
||||||
user_regset_copyin(&pos, &count, &kbuf, &ubuf,
|
|
||||||
&target->fp_regs->fpsr,
|
|
||||||
start, end);
|
|
||||||
out:
|
out:
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@@ -825,8 +825,9 @@ static const struct user_regset aarch64_regsets[] = {
|
|||||||
.core_note_type = NT_PRSTATUS,
|
.core_note_type = NT_PRSTATUS,
|
||||||
.n = sizeof(struct user_pt_regs) / sizeof(uint64_t),
|
.n = sizeof(struct user_pt_regs) / sizeof(uint64_t),
|
||||||
.size = sizeof(uint64_t),
|
.size = sizeof(uint64_t),
|
||||||
|
.align = sizeof(uint64_t),
|
||||||
.get = gpr_get,
|
.get = gpr_get,
|
||||||
.set = gpr_set
|
.set = gpr_set,
|
||||||
},
|
},
|
||||||
[REGSET_FPR] = {
|
[REGSET_FPR] = {
|
||||||
.core_note_type = NT_PRFPREG,
|
.core_note_type = NT_PRFPREG,
|
||||||
@@ -836,56 +837,75 @@ static const struct user_regset aarch64_regsets[] = {
|
|||||||
* fpcr are 32-bits wide.
|
* fpcr are 32-bits wide.
|
||||||
*/
|
*/
|
||||||
.size = sizeof(uint32_t),
|
.size = sizeof(uint32_t),
|
||||||
|
.align = sizeof(uint32_t),
|
||||||
.get = fpr_get,
|
.get = fpr_get,
|
||||||
.set = fpr_set
|
.set = fpr_set,
|
||||||
},
|
},
|
||||||
[REGSET_TLS] = {
|
[REGSET_TLS] = {
|
||||||
.core_note_type = NT_ARM_TLS,
|
.core_note_type = NT_ARM_TLS,
|
||||||
.n = 1,
|
.n = 1,
|
||||||
.size = sizeof(void *),
|
.size = sizeof(void *),
|
||||||
|
.align = sizeof(void *),
|
||||||
.get = tls_get,
|
.get = tls_get,
|
||||||
.set = tls_set
|
.set = tls_set,
|
||||||
},
|
},
|
||||||
[REGSET_HW_BREAK] = {
|
[REGSET_HW_BREAK] = {
|
||||||
.core_note_type = NT_ARM_HW_BREAK,
|
.core_note_type = NT_ARM_HW_BREAK,
|
||||||
.n = sizeof(struct user_hwdebug_state) / sizeof(uint32_t),
|
.n = sizeof(struct user_hwdebug_state) / sizeof(uint32_t),
|
||||||
.size = sizeof(uint32_t),
|
.size = sizeof(uint32_t),
|
||||||
|
.align = sizeof(uint32_t),
|
||||||
.get = hw_break_get,
|
.get = hw_break_get,
|
||||||
.set = hw_break_set
|
.set = hw_break_set,
|
||||||
},
|
},
|
||||||
[REGSET_HW_WATCH] = {
|
[REGSET_HW_WATCH] = {
|
||||||
.core_note_type = NT_ARM_HW_WATCH,
|
.core_note_type = NT_ARM_HW_WATCH,
|
||||||
.n = sizeof(struct user_hwdebug_state) / sizeof(uint32_t),
|
.n = sizeof(struct user_hwdebug_state) / sizeof(uint32_t),
|
||||||
.size = sizeof(uint32_t),
|
.size = sizeof(uint32_t),
|
||||||
|
.align = sizeof(uint32_t),
|
||||||
.get = hw_break_get,
|
.get = hw_break_get,
|
||||||
.set = hw_break_set
|
.set = hw_break_set,
|
||||||
},
|
},
|
||||||
[REGSET_SYSTEM_CALL] = {
|
[REGSET_SYSTEM_CALL] = {
|
||||||
.core_note_type = NT_ARM_SYSTEM_CALL,
|
.core_note_type = NT_ARM_SYSTEM_CALL,
|
||||||
.n = 1,
|
.n = 1,
|
||||||
.size = sizeof(int),
|
.size = sizeof(int),
|
||||||
|
.align = sizeof(int),
|
||||||
.get = system_call_get,
|
.get = system_call_get,
|
||||||
.set = system_call_set
|
.set = system_call_set,
|
||||||
},
|
},
|
||||||
#ifdef CONFIG_ARM64_SVE
|
#ifdef CONFIG_ARM64_SVE
|
||||||
[REGSET_SVE] = { /* Scalable Vector Extension */
|
[REGSET_SVE] = { /* Scalable Vector Extension */
|
||||||
.core_note_type = NT_ARM_SVE,
|
.core_note_type = NT_ARM_SVE,
|
||||||
.n = (SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE) + 15) / 16,
|
.n = (SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE) +
|
||||||
.size = 16,
|
(SVE_VQ_BYTES - 1)) / SVE_VQ_BYTES,
|
||||||
|
.size = SVE_VQ_BYTES,
|
||||||
|
.align = SVE_VQ_BYTES,
|
||||||
.get = sve_get,
|
.get = sve_get,
|
||||||
.set = sve_set
|
.set = sve_set,
|
||||||
|
.get_size = sve_get_size,
|
||||||
},
|
},
|
||||||
#endif /* CONFIG_ARM64_SVE */
|
#endif /* CONFIG_ARM64_SVE */
|
||||||
};
|
};
|
||||||
|
|
||||||
static const struct user_regset *
|
static const struct user_regset_view user_aarch64_view = {
|
||||||
find_regset(const struct user_regset *regset, unsigned int type, int n)
|
.name = "aarch64", .e_machine = EM_AARCH64,
|
||||||
|
.regsets = aarch64_regsets,
|
||||||
|
.n = sizeof(aarch64_regsets) / sizeof(aarch64_regsets[0])
|
||||||
|
};
|
||||||
|
|
||||||
|
const struct user_regset_view *current_user_regset_view(void)
|
||||||
|
{
|
||||||
|
return &user_aarch64_view;
|
||||||
|
}
|
||||||
|
|
||||||
|
const struct user_regset *find_regset(const struct user_regset_view *view,
|
||||||
|
unsigned int type)
|
||||||
{
|
{
|
||||||
int i = 0;
|
int i = 0;
|
||||||
|
|
||||||
for (i = 0; i < n; i++) {
|
for (i = 0; i < view->n; i++) {
|
||||||
if (regset[i].core_note_type == type) {
|
if (view->regsets[i].core_note_type == type) {
|
||||||
return ®set[i];
|
return &view->regsets[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return NULL;
|
return NULL;
|
||||||
@@ -894,8 +914,8 @@ find_regset(const struct user_regset *regset, unsigned int type, int n)
|
|||||||
static long ptrace_regset(struct thread *thread, int req, long type, struct iovec *iov)
|
static long ptrace_regset(struct thread *thread, int req, long type, struct iovec *iov)
|
||||||
{
|
{
|
||||||
long rc = -EINVAL;
|
long rc = -EINVAL;
|
||||||
const struct user_regset *regset = find_regset(aarch64_regsets, type,
|
const struct user_regset *regset =
|
||||||
sizeof(aarch64_regsets) / sizeof(aarch64_regsets[0]));
|
find_regset(&user_aarch64_view, type);
|
||||||
|
|
||||||
if (!regset) {
|
if (!regset) {
|
||||||
kprintf("%s: not supported type 0x%x\n", __FUNCTION__, type);
|
kprintf("%s: not supported type 0x%x\n", __FUNCTION__, type);
|
||||||
@@ -944,6 +964,7 @@ void ptrace_report_signal(struct thread *thread, int sig)
|
|||||||
/* save thread_info, if called by ptrace_report_exec() */
|
/* save thread_info, if called by ptrace_report_exec() */
|
||||||
if (sig == ((SIGTRAP | (PTRACE_EVENT_EXEC << 8)))) {
|
if (sig == ((SIGTRAP | (PTRACE_EVENT_EXEC << 8)))) {
|
||||||
memcpy(&tinfo, thread->ctx.thread, sizeof(struct thread_info));
|
memcpy(&tinfo, thread->ctx.thread, sizeof(struct thread_info));
|
||||||
|
thread->uctx->user_regs.regs[0] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
mcs_rwlock_writer_lock(&proc->update_lock, &lock);
|
mcs_rwlock_writer_lock(&proc->update_lock, &lock);
|
||||||
@@ -956,6 +977,13 @@ void ptrace_report_signal(struct thread *thread, int sig)
|
|||||||
thread->exit_status = sig;
|
thread->exit_status = sig;
|
||||||
thread->status = PS_TRACED;
|
thread->status = PS_TRACED;
|
||||||
thread->ptrace &= ~PT_TRACE_SYSCALL;
|
thread->ptrace &= ~PT_TRACE_SYSCALL;
|
||||||
|
if (sig == ((SIGTRAP | (PTRACE_EVENT_EXEC << 8))) &&
|
||||||
|
thread->ptrace & PTRACE_O_TRACEEXEC) {
|
||||||
|
/* PTRACE_O_TRACEEXEC: since Linux 3.0, the former
|
||||||
|
* thread ID can be retrieved with PTRACE_GETEVENTMSG.
|
||||||
|
* Report no change. */
|
||||||
|
thread->ptrace_eventmsg = thread->tid;
|
||||||
|
}
|
||||||
save_debugreg(thread->ptrace_debugreg);
|
save_debugreg(thread->ptrace_debugreg);
|
||||||
if (sig == SIGSTOP || sig == SIGTSTP ||
|
if (sig == SIGSTOP || sig == SIGTSTP ||
|
||||||
sig == SIGTTIN || sig == SIGTTOU) {
|
sig == SIGTTIN || sig == SIGTTOU) {
|
||||||
@@ -991,6 +1019,7 @@ void ptrace_report_signal(struct thread *thread, int sig)
|
|||||||
if (sig == ((SIGTRAP | (PTRACE_EVENT_EXEC << 8)))) {
|
if (sig == ((SIGTRAP | (PTRACE_EVENT_EXEC << 8)))) {
|
||||||
memcpy(thread->ctx.thread, &tinfo, sizeof(struct thread_info));
|
memcpy(thread->ctx.thread, &tinfo, sizeof(struct thread_info));
|
||||||
}
|
}
|
||||||
|
arch_flush_icache_all();
|
||||||
}
|
}
|
||||||
|
|
||||||
long
|
long
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
/* syscall.c COPYRIGHT FUJITSU LIMITED 2015-2018 */
|
/* syscall.c COPYRIGHT FUJITSU LIMITED 2015-2019 */
|
||||||
#include <cpulocal.h>
|
#include <cpulocal.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <kmalloc.h>
|
#include <kmalloc.h>
|
||||||
@@ -15,7 +15,9 @@
|
|||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#include <uio.h>
|
#include <uio.h>
|
||||||
#include <syscall.h>
|
#include <syscall.h>
|
||||||
#include <debug.h>
|
#include <bitops.h>
|
||||||
|
#include <rusage_private.h>
|
||||||
|
#include <ihk/debug.h>
|
||||||
|
|
||||||
void terminate_mcexec(int, int);
|
void terminate_mcexec(int, int);
|
||||||
extern void ptrace_report_signal(struct thread *thread, int sig);
|
extern void ptrace_report_signal(struct thread *thread, int sig);
|
||||||
@@ -42,7 +44,7 @@ uintptr_t debug_constants[] = {
|
|||||||
offsetof(struct cpu_local_var, runq),
|
offsetof(struct cpu_local_var, runq),
|
||||||
offsetof(struct cpu_local_var, status),
|
offsetof(struct cpu_local_var, status),
|
||||||
offsetof(struct cpu_local_var, idle),
|
offsetof(struct cpu_local_var, idle),
|
||||||
offsetof(struct thread, ctx) + offsetof(struct thread_info, cpu_context),
|
offsetof(struct thread, ctx),
|
||||||
offsetof(struct thread, sched_list),
|
offsetof(struct thread, sched_list),
|
||||||
offsetof(struct thread, proc),
|
offsetof(struct thread, proc),
|
||||||
offsetof(struct thread, status),
|
offsetof(struct thread, status),
|
||||||
@@ -114,7 +116,7 @@ arch_clear_host_user_space()
|
|||||||
|
|
||||||
/* XXX: might be unnecessary */
|
/* XXX: might be unnecessary */
|
||||||
clear_host_pte(th->vm->region.user_start,
|
clear_host_pte(th->vm->region.user_start,
|
||||||
(th->vm->region.user_end - th->vm->region.user_start));
|
(th->vm->region.user_end - th->vm->region.user_start), 0);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -126,10 +128,18 @@ extern unsigned long do_fork(int clone_flags, unsigned long newsp,
|
|||||||
|
|
||||||
SYSCALL_DECLARE(clone)
|
SYSCALL_DECLARE(clone)
|
||||||
{
|
{
|
||||||
|
struct process *proc = cpu_local_var(current)->proc;
|
||||||
|
struct mcs_rwlock_node_irqsave lock_dump;
|
||||||
|
unsigned long ret;
|
||||||
|
|
||||||
|
/* mutex coredump */
|
||||||
|
mcs_rwlock_reader_lock(&proc->coredump_lock, &lock_dump);
|
||||||
|
|
||||||
if ((int)ihk_mc_syscall_arg0(ctx) & CLONE_VFORK) {
|
if ((int)ihk_mc_syscall_arg0(ctx) & CLONE_VFORK) {
|
||||||
return do_fork(CLONE_VFORK|SIGCHLD, 0, 0, 0, 0, ihk_mc_syscall_pc(ctx), ihk_mc_syscall_sp(ctx));
|
ret = do_fork(CLONE_VFORK|SIGCHLD, 0, 0, 0, 0,
|
||||||
|
ihk_mc_syscall_pc(ctx), ihk_mc_syscall_sp(ctx));
|
||||||
} else {
|
} else {
|
||||||
return do_fork((int)ihk_mc_syscall_arg0(ctx), /* clone_flags */
|
ret = do_fork((int)ihk_mc_syscall_arg0(ctx), /* clone_flags */
|
||||||
ihk_mc_syscall_arg1(ctx), /* newsp */
|
ihk_mc_syscall_arg1(ctx), /* newsp */
|
||||||
ihk_mc_syscall_arg2(ctx), /* parent_tidptr */
|
ihk_mc_syscall_arg2(ctx), /* parent_tidptr */
|
||||||
ihk_mc_syscall_arg4(ctx), /* child_tidptr (swap arg3) */
|
ihk_mc_syscall_arg4(ctx), /* child_tidptr (swap arg3) */
|
||||||
@@ -137,6 +147,9 @@ SYSCALL_DECLARE(clone)
|
|||||||
ihk_mc_syscall_pc(ctx), /* curpc */
|
ihk_mc_syscall_pc(ctx), /* curpc */
|
||||||
ihk_mc_syscall_sp(ctx)); /* cursp */
|
ihk_mc_syscall_sp(ctx)); /* cursp */
|
||||||
}
|
}
|
||||||
|
mcs_rwlock_reader_unlock(&proc->coredump_lock, &lock_dump);
|
||||||
|
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
SYSCALL_DECLARE(rt_sigaction)
|
SYSCALL_DECLARE(rt_sigaction)
|
||||||
@@ -178,11 +191,10 @@ SYSCALL_DECLARE(prctl)
|
|||||||
|
|
||||||
switch (option) {
|
switch (option) {
|
||||||
case PR_SVE_SET_VL:
|
case PR_SVE_SET_VL:
|
||||||
error = SVE_SET_VL(cpu_local_var(current),
|
error = SVE_SET_VL(ihk_mc_syscall_arg1(ctx));
|
||||||
ihk_mc_syscall_arg1(ctx), ihk_mc_syscall_arg2(ctx));
|
|
||||||
break;
|
break;
|
||||||
case PR_SVE_GET_VL:
|
case PR_SVE_GET_VL:
|
||||||
error = SVE_GET_VL(cpu_local_var(current));
|
error = SVE_GET_VL();
|
||||||
break;
|
break;
|
||||||
case PR_SET_THP_DISABLE:
|
case PR_SET_THP_DISABLE:
|
||||||
if (arg3 || arg4 || arg5) {
|
if (arg3 || arg4 || arg5) {
|
||||||
@@ -657,7 +669,7 @@ void set_single_step(struct thread *thread)
|
|||||||
set_regs_spsr_ss(thread->uctx);
|
set_regs_spsr_ss(thread->uctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
extern void coredump(struct thread *thread, void *regs);
|
extern int coredump(struct thread *thread, void *regs, int sig);
|
||||||
|
|
||||||
static int
|
static int
|
||||||
isrestart(int syscallno, unsigned long rc, int sig, int restart)
|
isrestart(int syscallno, unsigned long rc, int sig, int restart)
|
||||||
@@ -1096,6 +1108,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
|||||||
struct mcs_rwlock_node_irqsave lock;
|
struct mcs_rwlock_node_irqsave lock;
|
||||||
struct mcs_rwlock_node_irqsave mcs_rw_node;
|
struct mcs_rwlock_node_irqsave mcs_rw_node;
|
||||||
int restart = 0;
|
int restart = 0;
|
||||||
|
int ret;
|
||||||
|
|
||||||
for(w = pending->sigmask.__val[0], sig = 0; w; sig++, w >>= 1);
|
for(w = pending->sigmask.__val[0], sig = 0; w; sig++, w >>= 1);
|
||||||
dkprintf("do_signal(): tid=%d, pid=%d, sig=%d\n", thread->tid, proc->pid, sig);
|
dkprintf("do_signal(): tid=%d, pid=%d, sig=%d\n", thread->tid, proc->pid, sig);
|
||||||
@@ -1270,15 +1283,6 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
|||||||
dkprintf("SIGTRAP(): woken up\n");
|
dkprintf("SIGTRAP(): woken up\n");
|
||||||
break;
|
break;
|
||||||
case SIGCONT:
|
case SIGCONT:
|
||||||
memset(&info, '\0', sizeof info);
|
|
||||||
info.si_signo = SIGCHLD;
|
|
||||||
info.si_code = CLD_CONTINUED;
|
|
||||||
info._sifields._sigchld.si_pid = proc->pid;
|
|
||||||
info._sifields._sigchld.si_status = 0x0000ffff;
|
|
||||||
do_kill(cpu_local_var(current), proc->parent->pid, -1, SIGCHLD, &info, 0);
|
|
||||||
proc->main_thread->signal_flags = SIGNAL_STOP_CONTINUED;
|
|
||||||
proc->status = PS_RUNNING;
|
|
||||||
dkprintf("do_signal,SIGCONT,do nothing\n");
|
|
||||||
break;
|
break;
|
||||||
case SIGQUIT:
|
case SIGQUIT:
|
||||||
case SIGILL:
|
case SIGILL:
|
||||||
@@ -1290,9 +1294,31 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
|||||||
case SIGXCPU:
|
case SIGXCPU:
|
||||||
case SIGXFSZ:
|
case SIGXFSZ:
|
||||||
core:
|
core:
|
||||||
dkprintf("do_signal,default,core,sig=%d\n", sig);
|
thread->coredump_regs =
|
||||||
coredump(thread, regs);
|
kmalloc(sizeof(struct pt_regs),
|
||||||
coredumped = 0x80;
|
IHK_MC_AP_NOWAIT);
|
||||||
|
if (!thread->coredump_regs) {
|
||||||
|
kprintf("%s: Out of memory\n", __func__);
|
||||||
|
goto skip;
|
||||||
|
}
|
||||||
|
memcpy(thread->coredump_regs, regs,
|
||||||
|
sizeof(struct pt_regs));
|
||||||
|
|
||||||
|
ret = coredump(thread, regs, sig);
|
||||||
|
switch (ret) {
|
||||||
|
case -EBUSY:
|
||||||
|
kprintf("%s: INFO: coredump not performed, try ulimit -c <non-zero>\n",
|
||||||
|
__func__);
|
||||||
|
break;
|
||||||
|
case 0:
|
||||||
|
coredumped = 0x80;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
kprintf("%s: ERROR: coredump failed (%d)\n",
|
||||||
|
__func__, ret);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
skip:
|
||||||
terminate(0, sig | coredumped);
|
terminate(0, sig | coredumped);
|
||||||
break;
|
break;
|
||||||
case SIGCHLD:
|
case SIGCHLD:
|
||||||
@@ -1422,7 +1448,9 @@ __check_signal(unsigned long rc, void *regs0, int num, int irq_disabled)
|
|||||||
|
|
||||||
if(thread == NULL || thread->proc->pid == 0){
|
if(thread == NULL || thread->proc->pid == 0){
|
||||||
struct thread *t;
|
struct thread *t;
|
||||||
irqstate = ihk_mc_spinlock_lock(&(cpu_local_var(runq_lock)));
|
|
||||||
|
irqstate = cpu_disable_interrupt_save();
|
||||||
|
ihk_mc_spinlock_lock_noirq(&(cpu_local_var(runq_lock)));
|
||||||
list_for_each_entry(t, &(cpu_local_var(runq)), sched_list){
|
list_for_each_entry(t, &(cpu_local_var(runq)), sched_list){
|
||||||
if(t->proc->pid <= 0)
|
if(t->proc->pid <= 0)
|
||||||
continue;
|
continue;
|
||||||
@@ -1432,7 +1460,8 @@ __check_signal(unsigned long rc, void *regs0, int num, int irq_disabled)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ihk_mc_spinlock_unlock(&(cpu_local_var(runq_lock)), irqstate);
|
ihk_mc_spinlock_unlock_noirq(&(cpu_local_var(runq_lock)));
|
||||||
|
cpu_restore_interrupt(irqstate);
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1498,7 +1527,9 @@ check_sig_pending_thread(struct thread *thread)
|
|||||||
sig++, x >>= 1)
|
sig++, x >>= 1)
|
||||||
;
|
;
|
||||||
k = thread->sigcommon->action + sig - 1;
|
k = thread->sigcommon->action + sig - 1;
|
||||||
if ((sig != SIGCHLD && sig != SIGURG) ||
|
if ((sig != SIGCHLD &&
|
||||||
|
sig != SIGURG &&
|
||||||
|
sig != SIGCONT) ||
|
||||||
(k->sa.sa_handler != SIG_IGN &&
|
(k->sa.sa_handler != SIG_IGN &&
|
||||||
k->sa.sa_handler != NULL)) {
|
k->sa.sa_handler != NULL)) {
|
||||||
if (!(pending->sigmask.__val[0] & w)) {
|
if (!(pending->sigmask.__val[0] & w)) {
|
||||||
@@ -1507,6 +1538,7 @@ check_sig_pending_thread(struct thread *thread)
|
|||||||
found = 1;
|
found = 1;
|
||||||
if (sig != SIGCHLD &&
|
if (sig != SIGCHLD &&
|
||||||
sig != SIGURG &&
|
sig != SIGURG &&
|
||||||
|
sig != SIGCONT &&
|
||||||
!k->sa.sa_handler) {
|
!k->sa.sa_handler) {
|
||||||
found = 2;
|
found = 2;
|
||||||
break;
|
break;
|
||||||
@@ -1590,7 +1622,6 @@ do_kill(struct thread * thread, int pid, int tid, int sig, siginfo_t *info, int
|
|||||||
struct list_head *head = NULL;
|
struct list_head *head = NULL;
|
||||||
int rc;
|
int rc;
|
||||||
unsigned long irqstate = 0;
|
unsigned long irqstate = 0;
|
||||||
struct k_sigaction *k;
|
|
||||||
int doint;
|
int doint;
|
||||||
int found = 0;
|
int found = 0;
|
||||||
siginfo_t info0;
|
siginfo_t info0;
|
||||||
@@ -1600,6 +1631,7 @@ do_kill(struct thread * thread, int pid, int tid, int sig, siginfo_t *info, int
|
|||||||
struct process_hash *phash = rset->process_hash;
|
struct process_hash *phash = rset->process_hash;
|
||||||
struct mcs_rwlock_node lock;
|
struct mcs_rwlock_node lock;
|
||||||
struct mcs_rwlock_node updatelock;
|
struct mcs_rwlock_node updatelock;
|
||||||
|
struct sig_pending *pending = NULL;
|
||||||
|
|
||||||
if(sig > SIGRTMAX || sig < 0)
|
if(sig > SIGRTMAX || sig < 0)
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
@@ -1786,47 +1818,61 @@ done:
|
|||||||
|
|
||||||
mcs_rwlock_writer_lock_noirq(savelock, &mcs_rw_node);
|
mcs_rwlock_writer_lock_noirq(savelock, &mcs_rw_node);
|
||||||
|
|
||||||
/* Put signal event even when handler is SIG_IGN or SIG_DFL
|
|
||||||
because target ptraced thread must call ptrace_report_signal
|
|
||||||
in check_signal */
|
|
||||||
rc = 0;
|
rc = 0;
|
||||||
k = tthread->sigcommon->action + sig - 1;
|
|
||||||
if ((sig != SIGKILL && (tthread->ptrace & PT_TRACED)) ||
|
if (sig < SIGRTMIN) { // SIGRTMIN - SIGRTMAX
|
||||||
(k->sa.sa_handler != SIG_IGN &&
|
list_for_each_entry(pending, head, list) {
|
||||||
(k->sa.sa_handler != NULL ||
|
if (pending->sigmask.__val[0] == mask &&
|
||||||
(sig != SIGCHLD && sig != SIGURG)))) {
|
pending->ptracecont == ptracecont)
|
||||||
struct sig_pending *pending = NULL;
|
break;
|
||||||
if (sig < SIGRTMIN) { // SIGRTMIN - SIGRTMAX
|
|
||||||
list_for_each_entry(pending, head, list){
|
|
||||||
if(pending->sigmask.__val[0] == mask &&
|
|
||||||
pending->ptracecont == ptracecont)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if(&pending->list == head)
|
|
||||||
pending = NULL;
|
|
||||||
}
|
}
|
||||||
if(pending == NULL){
|
if (&pending->list == head)
|
||||||
doint = 1;
|
pending = NULL;
|
||||||
pending = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT);
|
}
|
||||||
if(!pending){
|
if (pending == NULL) {
|
||||||
rc = -ENOMEM;
|
doint = 1;
|
||||||
}
|
pending = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT);
|
||||||
else{
|
if (!pending) {
|
||||||
memset(pending, 0, sizeof(struct sig_pending));
|
rc = -ENOMEM;
|
||||||
pending->sigmask.__val[0] = mask;
|
}
|
||||||
memcpy(&pending->info, info, sizeof(siginfo_t));
|
else {
|
||||||
pending->ptracecont = ptracecont;
|
memset(pending, 0, sizeof(struct sig_pending));
|
||||||
if(sig == SIGKILL || sig == SIGSTOP)
|
pending->sigmask.__val[0] = mask;
|
||||||
list_add(&pending->list, head);
|
memcpy(&pending->info, info, sizeof(siginfo_t));
|
||||||
else
|
pending->ptracecont = ptracecont;
|
||||||
list_add_tail(&pending->list, head);
|
if (sig == SIGKILL || sig == SIGSTOP)
|
||||||
tthread->sigevent = 1;
|
list_add(&pending->list, head);
|
||||||
}
|
else
|
||||||
|
list_add_tail(&pending->list, head);
|
||||||
|
tthread->sigevent = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
mcs_rwlock_writer_unlock_noirq(savelock, &mcs_rw_node);
|
mcs_rwlock_writer_unlock_noirq(savelock, &mcs_rw_node);
|
||||||
cpu_restore_interrupt(irqstate);
|
cpu_restore_interrupt(irqstate);
|
||||||
|
|
||||||
|
if (sig == SIGCONT || ptracecont == 1) {
|
||||||
|
/* Wake up the target only when stopped by SIGSTOP */
|
||||||
|
if (sched_wakeup_thread(tthread, PS_STOPPED) == 0) {
|
||||||
|
struct siginfo info;
|
||||||
|
|
||||||
|
tthread->proc->main_thread->signal_flags =
|
||||||
|
SIGNAL_STOP_CONTINUED;
|
||||||
|
tthread->proc->status = PS_RUNNING;
|
||||||
|
memset(&info, '\0', sizeof(info));
|
||||||
|
info.si_signo = SIGCHLD;
|
||||||
|
info.si_code = CLD_CONTINUED;
|
||||||
|
info._sifields._sigchld.si_pid = tthread->proc->pid;
|
||||||
|
info._sifields._sigchld.si_status = 0x0000ffff;
|
||||||
|
do_kill(tthread, tthread->proc->parent->pid, -1,
|
||||||
|
SIGCHLD, &info, 0);
|
||||||
|
if (thread != tthread) {
|
||||||
|
ihk_mc_interrupt_cpu(tthread->cpu_id,
|
||||||
|
ihk_mc_get_vector(IHK_GV_IKC));
|
||||||
|
}
|
||||||
|
doint = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
if (doint && !(mask & tthread->sigmask.__val[0])) {
|
if (doint && !(mask & tthread->sigmask.__val[0])) {
|
||||||
int status = tthread->status;
|
int status = tthread->status;
|
||||||
|
|
||||||
@@ -1841,11 +1887,6 @@ done:
|
|||||||
/* Wake up the target only when stopped by ptrace-reporting */
|
/* Wake up the target only when stopped by ptrace-reporting */
|
||||||
sched_wakeup_thread(tthread, PS_TRACED | PS_STOPPED | PS_INTERRUPTIBLE);
|
sched_wakeup_thread(tthread, PS_TRACED | PS_STOPPED | PS_INTERRUPTIBLE);
|
||||||
}
|
}
|
||||||
else if(sig == SIGCONT || ptracecont == 1){
|
|
||||||
/* Wake up the target only when stopped by SIGSTOP */
|
|
||||||
sched_wakeup_thread(tthread, PS_STOPPED);
|
|
||||||
tthread->proc->status = PS_RUNNING;
|
|
||||||
}
|
|
||||||
else {
|
else {
|
||||||
sched_wakeup_thread(tthread, PS_INTERRUPTIBLE);
|
sched_wakeup_thread(tthread, PS_INTERRUPTIBLE);
|
||||||
}
|
}
|
||||||
@@ -1870,7 +1911,7 @@ set_signal(int sig, void *regs0, siginfo_t *info)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if ((__sigmask(sig) & thread->sigmask.__val[0])) {
|
if ((__sigmask(sig) & thread->sigmask.__val[0])) {
|
||||||
coredump(thread, regs0);
|
coredump(thread, regs0, sig);
|
||||||
terminate(0, sig | 0x80);
|
terminate(0, sig | 0x80);
|
||||||
}
|
}
|
||||||
do_kill(thread, thread->proc->pid, thread->tid, sig, info, 0);
|
do_kill(thread, thread->proc->pid, thread->tid, sig, info, 0);
|
||||||
@@ -1900,7 +1941,7 @@ SYSCALL_DECLARE(mmap)
|
|||||||
;
|
;
|
||||||
|
|
||||||
const uintptr_t addr0 = ihk_mc_syscall_arg0(ctx);
|
const uintptr_t addr0 = ihk_mc_syscall_arg0(ctx);
|
||||||
const size_t len0 = ihk_mc_syscall_arg1(ctx);
|
size_t len0 = ihk_mc_syscall_arg1(ctx);
|
||||||
const int prot = ihk_mc_syscall_arg2(ctx);
|
const int prot = ihk_mc_syscall_arg2(ctx);
|
||||||
const int flags0 = ihk_mc_syscall_arg3(ctx);
|
const int flags0 = ihk_mc_syscall_arg3(ctx);
|
||||||
const int fd = ihk_mc_syscall_arg4(ctx);
|
const int fd = ihk_mc_syscall_arg4(ctx);
|
||||||
@@ -1941,7 +1982,8 @@ SYSCALL_DECLARE(mmap)
|
|||||||
|
|
||||||
if (hugeshift == 0) {
|
if (hugeshift == 0) {
|
||||||
/* default hugepage size */
|
/* default hugepage size */
|
||||||
flags |= MAP_HUGE_SECOND_BLOCK;
|
flags |= ihk_mc_get_linux_default_huge_page_shift() <<
|
||||||
|
MAP_HUGE_SHIFT;
|
||||||
} else if ((first_level_block_support &&
|
} else if ((first_level_block_support &&
|
||||||
hugeshift == MAP_HUGE_FIRST_BLOCK) ||
|
hugeshift == MAP_HUGE_FIRST_BLOCK) ||
|
||||||
(first_level_block_support &&
|
(first_level_block_support &&
|
||||||
@@ -1958,6 +2000,14 @@ SYSCALL_DECLARE(mmap)
|
|||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
pgsize = (size_t)1 << ((flags >> MAP_HUGE_SHIFT) & 0x3F);
|
pgsize = (size_t)1 << ((flags >> MAP_HUGE_SHIFT) & 0x3F);
|
||||||
|
/* Round-up map length by pagesize */
|
||||||
|
len0 = ALIGN(len0, pgsize);
|
||||||
|
|
||||||
|
if (rusage_check_overmap(len0,
|
||||||
|
(flags >> MAP_HUGE_SHIFT) & 0x3F)) {
|
||||||
|
error = -ENOMEM;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#define VALID_DUMMY_ADDR ((region->user_start + PTL3_SIZE - 1) & ~(PTL3_SIZE - 1))
|
#define VALID_DUMMY_ADDR ((region->user_start + PTL3_SIZE - 1) & ~(PTL3_SIZE - 1))
|
||||||
@@ -1993,7 +2043,7 @@ SYSCALL_DECLARE(mmap)
|
|||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
addr = do_mmap(addr, len, prot, flags, fd, off0);
|
addr = do_mmap(addr, len, prot, flags, fd, off0, 0, NULL);
|
||||||
|
|
||||||
error = 0;
|
error = 0;
|
||||||
out:
|
out:
|
||||||
@@ -2018,7 +2068,8 @@ SYSCALL_DECLARE(shmget)
|
|||||||
|
|
||||||
if (hugeshift == 0) {
|
if (hugeshift == 0) {
|
||||||
/* default hugepage size */
|
/* default hugepage size */
|
||||||
shmflg |= SHM_HUGE_SECOND_BLOCK;
|
shmflg |= ihk_mc_get_linux_default_huge_page_shift() <<
|
||||||
|
MAP_HUGE_SHIFT;
|
||||||
} else if ((first_level_block_support &&
|
} else if ((first_level_block_support &&
|
||||||
hugeshift == SHM_HUGE_FIRST_BLOCK) ||
|
hugeshift == SHM_HUGE_FIRST_BLOCK) ||
|
||||||
(first_level_block_support &&
|
(first_level_block_support &&
|
||||||
@@ -2082,10 +2133,10 @@ int do_process_vm_read_writev(int pid,
|
|||||||
struct process *rproc;
|
struct process *rproc;
|
||||||
struct process *lproc = lthread->proc;
|
struct process *lproc = lthread->proc;
|
||||||
struct process_vm *rvm = NULL;
|
struct process_vm *rvm = NULL;
|
||||||
unsigned long rphys;
|
unsigned long lphys, rphys;
|
||||||
unsigned long rpage_left;
|
unsigned long lpage_left, rpage_left;
|
||||||
unsigned long psize;
|
unsigned long lpsize, rpsize;
|
||||||
void *rva;
|
void *rva, *lva;
|
||||||
struct vm_range *range;
|
struct vm_range *range;
|
||||||
struct mcs_rwlock_node_irqsave lock;
|
struct mcs_rwlock_node_irqsave lock;
|
||||||
struct mcs_rwlock_node update_lock;
|
struct mcs_rwlock_node update_lock;
|
||||||
@@ -2279,10 +2330,53 @@ pri_out:
|
|||||||
to_copy = remote_iov[ri].iov_len - roff;
|
to_copy = remote_iov[ri].iov_len - roff;
|
||||||
}
|
}
|
||||||
|
|
||||||
retry_lookup:
|
retry_llookup:
|
||||||
|
/* Figure out local physical */
|
||||||
|
/* TODO: remember page and do this only if necessary */
|
||||||
|
ret = ihk_mc_pt_virt_to_phys_size(lthread->vm->address_space->page_table,
|
||||||
|
local_iov[li].iov_base + loff, &lphys, &lpsize);
|
||||||
|
|
||||||
|
if (ret) {
|
||||||
|
uint64_t reason = PF_POPULATE | PF_WRITE | PF_USER;
|
||||||
|
void *addr;
|
||||||
|
|
||||||
|
if (faulted) {
|
||||||
|
ret = -EFAULT;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Fault in pages */
|
||||||
|
for (addr = (void *)
|
||||||
|
(((unsigned long)local_iov[li].iov_base + loff)
|
||||||
|
& PAGE_MASK);
|
||||||
|
addr < (local_iov[li].iov_base + loff + to_copy);
|
||||||
|
addr += PAGE_SIZE) {
|
||||||
|
|
||||||
|
ret = page_fault_process_vm(lthread->vm, addr, reason);
|
||||||
|
if (ret) {
|
||||||
|
ret = -EFAULT;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
faulted = 1;
|
||||||
|
goto retry_llookup;
|
||||||
|
}
|
||||||
|
|
||||||
|
lpage_left = ((((unsigned long)local_iov[li].iov_base + loff +
|
||||||
|
lpsize) & ~(lpsize - 1)) -
|
||||||
|
((unsigned long)local_iov[li].iov_base + loff));
|
||||||
|
if (lpage_left < to_copy) {
|
||||||
|
to_copy = lpage_left;
|
||||||
|
}
|
||||||
|
|
||||||
|
lva = phys_to_virt(lphys);
|
||||||
|
|
||||||
|
retry_rlookup:
|
||||||
|
/* Figure out remote physical */
|
||||||
/* TODO: remember page and do this only if necessary */
|
/* TODO: remember page and do this only if necessary */
|
||||||
ret = ihk_mc_pt_virt_to_phys_size(rvm->address_space->page_table,
|
ret = ihk_mc_pt_virt_to_phys_size(rvm->address_space->page_table,
|
||||||
remote_iov[ri].iov_base + roff, &rphys, &psize);
|
remote_iov[ri].iov_base + roff, &rphys, &rpsize);
|
||||||
|
|
||||||
if (ret) {
|
if (ret) {
|
||||||
uint64_t reason = PF_POPULATE | PF_WRITE | PF_USER;
|
uint64_t reason = PF_POPULATE | PF_WRITE | PF_USER;
|
||||||
@@ -2308,11 +2402,11 @@ retry_lookup:
|
|||||||
}
|
}
|
||||||
|
|
||||||
faulted = 1;
|
faulted = 1;
|
||||||
goto retry_lookup;
|
goto retry_rlookup;
|
||||||
}
|
}
|
||||||
|
|
||||||
rpage_left = ((((unsigned long)remote_iov[ri].iov_base + roff +
|
rpage_left = ((((unsigned long)remote_iov[ri].iov_base + roff +
|
||||||
psize) & ~(psize - 1)) -
|
rpsize) & ~(rpsize - 1)) -
|
||||||
((unsigned long)remote_iov[ri].iov_base + roff));
|
((unsigned long)remote_iov[ri].iov_base + roff));
|
||||||
if (rpage_left < to_copy) {
|
if (rpage_left < to_copy) {
|
||||||
to_copy = rpage_left;
|
to_copy = rpage_left;
|
||||||
@@ -2321,16 +2415,16 @@ retry_lookup:
|
|||||||
rva = phys_to_virt(rphys);
|
rva = phys_to_virt(rphys);
|
||||||
|
|
||||||
fast_memcpy(
|
fast_memcpy(
|
||||||
(op == PROCESS_VM_READ) ? local_iov[li].iov_base + loff : rva,
|
(op == PROCESS_VM_READ) ? lva : rva,
|
||||||
(op == PROCESS_VM_READ) ? rva : local_iov[li].iov_base + loff,
|
(op == PROCESS_VM_READ) ? rva : lva,
|
||||||
to_copy);
|
to_copy);
|
||||||
|
|
||||||
copied += to_copy;
|
copied += to_copy;
|
||||||
dkprintf("local_iov[%d]: 0x%lx %s remote_iov[%d]: 0x%lx, %lu copied, psize: %lu, rpage_left: %lu\n",
|
dkprintf("local_iov[%d]: 0x%lx %s remote_iov[%d]: 0x%lx, %lu copied, rpsize: %lu, rpage_left: %lu\n",
|
||||||
li, local_iov[li].iov_base + loff,
|
li, local_iov[li].iov_base + loff,
|
||||||
(op == PROCESS_VM_READ) ? "<-" : "->",
|
(op == PROCESS_VM_READ) ? "<-" : "->",
|
||||||
ri, remote_iov[ri].iov_base + roff, to_copy,
|
ri, remote_iov[ri].iov_base + roff, to_copy,
|
||||||
psize, rpage_left);
|
rpsize, rpage_left);
|
||||||
|
|
||||||
loff += to_copy;
|
loff += to_copy;
|
||||||
roff += to_copy;
|
roff += to_copy;
|
||||||
@@ -2700,4 +2794,48 @@ SYSCALL_DECLARE(time)
|
|||||||
return time();
|
return time();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void calculate_time_from_tsc(struct timespec *ts)
|
||||||
|
{
|
||||||
|
long ver;
|
||||||
|
unsigned long current_tsc;
|
||||||
|
time_t sec_delta;
|
||||||
|
long ns_delta;
|
||||||
|
|
||||||
|
for (;;) {
|
||||||
|
while ((ver = ihk_atomic64_read(&tod_data.version)) & 1) {
|
||||||
|
/* settimeofday() is in progress */
|
||||||
|
cpu_pause();
|
||||||
|
}
|
||||||
|
rmb(); /* fetch version before time */
|
||||||
|
*ts = tod_data.origin;
|
||||||
|
rmb(); /* fetch time before checking version */
|
||||||
|
if (ver == ihk_atomic64_read(&tod_data.version)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* settimeofday() has intervened */
|
||||||
|
cpu_pause();
|
||||||
|
}
|
||||||
|
|
||||||
|
current_tsc = rdtsc();
|
||||||
|
sec_delta = current_tsc / tod_data.clocks_per_sec;
|
||||||
|
ns_delta = NS_PER_SEC * (current_tsc % tod_data.clocks_per_sec)
|
||||||
|
/ tod_data.clocks_per_sec;
|
||||||
|
/* calc. of ns_delta overflows if clocks_per_sec exceeds 18.44 GHz */
|
||||||
|
|
||||||
|
ts->tv_sec += sec_delta;
|
||||||
|
ts->tv_nsec += ns_delta;
|
||||||
|
if (ts->tv_nsec >= NS_PER_SEC) {
|
||||||
|
ts->tv_nsec -= NS_PER_SEC;
|
||||||
|
++ts->tv_sec;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
extern void ptrace_syscall_event(struct thread *thread);
|
||||||
|
long arch_ptrace_syscall_event(struct thread *thread,
|
||||||
|
ihk_mc_user_context_t *ctx, long setret)
|
||||||
|
{
|
||||||
|
ptrace_syscall_event(thread);
|
||||||
|
return setret;
|
||||||
|
}
|
||||||
/*** End of File ***/
|
/*** End of File ***/
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
#include <cputype.h>
|
#include <cputype.h>
|
||||||
#include <irq.h>
|
#include <irq.h>
|
||||||
#include <arch-timer.h>
|
#include <arch-timer.h>
|
||||||
#include <debug.h>
|
#include <ihk/debug.h>
|
||||||
|
|
||||||
//#define DEBUG_PRINT_TIMER
|
//#define DEBUG_PRINT_TIMER
|
||||||
|
|
||||||
@@ -111,6 +111,8 @@ static void timer_handler(void *priv)
|
|||||||
/* set timer re-enable for periodic */
|
/* set timer re-enable for periodic */
|
||||||
arch_timer_reg_write(ARCH_TIMER_REG_TVAL, clocks);
|
arch_timer_reg_write(ARCH_TIMER_REG_TVAL, clocks);
|
||||||
arch_timer_reg_write(ARCH_TIMER_REG_CTRL, ctrl);
|
arch_timer_reg_write(ARCH_TIMER_REG_CTRL, ctrl);
|
||||||
|
|
||||||
|
do_backlog();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -11,10 +11,9 @@
|
|||||||
#include <process.h>
|
#include <process.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <syscall.h>
|
#include <syscall.h>
|
||||||
#include <ihk/debug.h>
|
|
||||||
#include <ikc/queue.h>
|
#include <ikc/queue.h>
|
||||||
#include <vdso.h>
|
#include <vdso.h>
|
||||||
#include <debug.h>
|
#include <ihk/debug.h>
|
||||||
|
|
||||||
//#define DEBUG_PRINT_VDSO
|
//#define DEBUG_PRINT_VDSO
|
||||||
|
|
||||||
@@ -23,7 +22,6 @@
|
|||||||
#define DDEBUG_DEFAULT DDEBUG_PRINT
|
#define DDEBUG_DEFAULT DDEBUG_PRINT
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_52
|
|
||||||
#define VDSO_MAXPAGES 1
|
#define VDSO_MAXPAGES 1
|
||||||
struct vdso {
|
struct vdso {
|
||||||
long busy;
|
long busy;
|
||||||
@@ -34,7 +32,6 @@ struct vdso {
|
|||||||
long lbase;
|
long lbase;
|
||||||
long offset_sigtramp;
|
long offset_sigtramp;
|
||||||
};
|
};
|
||||||
#endif /*POSTK_DEBUG_ARCH_DEP_52*/
|
|
||||||
|
|
||||||
extern char vdso_start, vdso_end;
|
extern char vdso_start, vdso_end;
|
||||||
static struct vdso vdso;
|
static struct vdso vdso;
|
||||||
@@ -90,6 +87,7 @@ int arch_setup_vdso(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
panic("Only support host mapping vDSO");
|
panic("Only support host mapping vDSO");
|
||||||
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int get_free_area(struct process_vm *vm, size_t len, intptr_t hint,
|
static int get_free_area(struct process_vm *vm, size_t len, intptr_t hint,
|
||||||
@@ -157,7 +155,7 @@ int arch_map_vdso(struct process_vm *vm)
|
|||||||
flag = VR_REMOTE | VR_PROT_READ;
|
flag = VR_REMOTE | VR_PROT_READ;
|
||||||
flag |= VRFLAG_PROT_TO_MAXPROT(flag);
|
flag |= VRFLAG_PROT_TO_MAXPROT(flag);
|
||||||
ret = add_process_memory_range(vm, start, end, vdso.vvar_phys, flag,
|
ret = add_process_memory_range(vm, start, end, vdso.vvar_phys, flag,
|
||||||
NULL, 0, PAGE_SHIFT, &range);
|
NULL, 0, PAGE_SHIFT, NULL, &range);
|
||||||
if (ret != 0){
|
if (ret != 0){
|
||||||
dkprintf("ERROR: adding memory range for tod_data\n");
|
dkprintf("ERROR: adding memory range for tod_data\n");
|
||||||
goto exit;
|
goto exit;
|
||||||
@@ -169,7 +167,7 @@ int arch_map_vdso(struct process_vm *vm)
|
|||||||
flag = VR_REMOTE | VR_PROT_READ | VR_PROT_EXEC;
|
flag = VR_REMOTE | VR_PROT_READ | VR_PROT_EXEC;
|
||||||
flag |= VRFLAG_PROT_TO_MAXPROT(flag);
|
flag |= VRFLAG_PROT_TO_MAXPROT(flag);
|
||||||
ret = add_process_memory_range(vm, start, end, vdso.vdso_physlist[0], flag,
|
ret = add_process_memory_range(vm, start, end, vdso.vdso_physlist[0], flag,
|
||||||
NULL, 0, PAGE_SHIFT, &range);
|
NULL, 0, PAGE_SHIFT, NULL, &range);
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
dkprintf("ERROR: adding memory range for vdso_text\n");
|
dkprintf("ERROR: adding memory range for vdso_text\n");
|
||||||
|
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ extern char data_start[], data_end[];
|
|||||||
#define LARGE_PAGE_MASK (~((unsigned long)LARGE_PAGE_SIZE - 1))
|
#define LARGE_PAGE_MASK (~((unsigned long)LARGE_PAGE_SIZE - 1))
|
||||||
|
|
||||||
#define MAP_ST_START 0xffff800000000000UL
|
#define MAP_ST_START 0xffff800000000000UL
|
||||||
#define MAP_KERNEL_START 0xffffffff80000000UL
|
/* MAP_KERNEL_START is defined by cmake */
|
||||||
|
|
||||||
#define PTL4_SHIFT 39
|
#define PTL4_SHIFT 39
|
||||||
#define PTL3_SHIFT 30
|
#define PTL3_SHIFT 30
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
/* coredump.c COPYRIGHT FUJITSU LIMITED 2018 */
|
/* coredump.c COPYRIGHT FUJITSU LIMITED 2018-2019 */
|
||||||
#include <process.h>
|
#include <process.h>
|
||||||
#include <elfcore.h>
|
#include <elfcore.h>
|
||||||
|
|
||||||
void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread, void *regs0)
|
void arch_fill_prstatus(struct elf_prstatus64 *prstatus,
|
||||||
|
struct thread *thread, void *regs0, int sig)
|
||||||
{
|
{
|
||||||
struct x86_user_context *uctx = regs0;
|
struct x86_user_context *uctx = regs0;
|
||||||
struct x86_basic_regs *regs = &uctx->gpr;
|
struct x86_basic_regs *regs = &uctx->gpr;
|
||||||
@@ -18,8 +19,6 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,
|
|||||||
short int pr_cursig;
|
short int pr_cursig;
|
||||||
a8_uint64_t pr_sigpend;
|
a8_uint64_t pr_sigpend;
|
||||||
a8_uint64_t pr_sighold;
|
a8_uint64_t pr_sighold;
|
||||||
pid_t pr_pid;
|
|
||||||
pid_t pr_ppid;
|
|
||||||
pid_t pr_pgrp;
|
pid_t pr_pgrp;
|
||||||
pid_t pr_sid;
|
pid_t pr_sid;
|
||||||
struct prstatus64_timeval pr_utime;
|
struct prstatus64_timeval pr_utime;
|
||||||
@@ -28,6 +27,14 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,
|
|||||||
struct prstatus64_timeval pr_cstime;
|
struct prstatus64_timeval pr_cstime;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
prstatus->pr_pid = thread->tid;
|
||||||
|
if (thread->proc->parent) {
|
||||||
|
prstatus->pr_ppid = thread->proc->parent->pid;
|
||||||
|
}
|
||||||
|
|
||||||
|
prstatus->pr_info.si_signo = sig;
|
||||||
|
prstatus->pr_cursig = sig;
|
||||||
|
|
||||||
prstatus->pr_reg[0] = _r15;
|
prstatus->pr_reg[0] = _r15;
|
||||||
prstatus->pr_reg[1] = _r14;
|
prstatus->pr_reg[1] = _r14;
|
||||||
prstatus->pr_reg[2] = _r13;
|
prstatus->pr_reg[2] = _r13;
|
||||||
@@ -55,3 +62,13 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,
|
|||||||
|
|
||||||
prstatus->pr_fpvalid = 0; /* We assume no fp */
|
prstatus->pr_fpvalid = 0; /* We assume no fp */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void arch_fill_thread_core_info(struct note *head,
|
||||||
|
struct thread *thread, void *regs)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
int arch_get_thread_core_info_size(void)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
/* cpu.c COPYRIGHT FUJITSU LIMITED 2018 */
|
/* cpu.c COPYRIGHT FUJITSU LIMITED 2018-2019 */
|
||||||
/**
|
/**
|
||||||
* \file cpu.c
|
* \file cpu.c
|
||||||
* License details are found in the file LICENSE.
|
* License details are found in the file LICENSE.
|
||||||
@@ -16,7 +16,6 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include <ihk/cpu.h>
|
#include <ihk/cpu.h>
|
||||||
#include <ihk/debug.h>
|
|
||||||
#include <ihk/mm.h>
|
#include <ihk/mm.h>
|
||||||
#include <types.h>
|
#include <types.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
@@ -32,7 +31,7 @@
|
|||||||
#include <prctl.h>
|
#include <prctl.h>
|
||||||
#include <page.h>
|
#include <page.h>
|
||||||
#include <kmalloc.h>
|
#include <kmalloc.h>
|
||||||
#include <debug.h>
|
#include <ihk/debug.h>
|
||||||
|
|
||||||
#define LAPIC_ID 0x020
|
#define LAPIC_ID 0x020
|
||||||
#define LAPIC_TIMER 0x320
|
#define LAPIC_TIMER 0x320
|
||||||
@@ -45,11 +44,9 @@
|
|||||||
#define LAPIC_ICR0 0x300
|
#define LAPIC_ICR0 0x300
|
||||||
#define LAPIC_ICR2 0x310
|
#define LAPIC_ICR2 0x310
|
||||||
#define LAPIC_ESR 0x280
|
#define LAPIC_ESR 0x280
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_75 /* x86 depend hide */
|
|
||||||
#define LOCAL_TIMER_VECTOR 0xef
|
#define LOCAL_TIMER_VECTOR 0xef
|
||||||
#define LOCAL_PERF_VECTOR 0xf0
|
#define LOCAL_PERF_VECTOR 0xf0
|
||||||
#define LOCAL_SMP_FUNC_CALL_VECTOR 0xf1
|
#define LOCAL_SMP_FUNC_CALL_VECTOR 0xf1
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_75 */
|
|
||||||
|
|
||||||
#define APIC_INT_LEVELTRIG 0x08000
|
#define APIC_INT_LEVELTRIG 0x08000
|
||||||
#define APIC_INT_ASSERT 0x04000
|
#define APIC_INT_ASSERT 0x04000
|
||||||
@@ -148,7 +145,7 @@ void reload_idt(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static struct list_head handlers[256 - 32];
|
static struct list_head handlers[256 - 32];
|
||||||
extern char nmi[];
|
extern char nmi_handler[];
|
||||||
extern char page_fault[], general_protection_exception[];
|
extern char page_fault[], general_protection_exception[];
|
||||||
extern char debug_exception[], int3_exception[];
|
extern char debug_exception[], int3_exception[];
|
||||||
|
|
||||||
@@ -175,7 +172,7 @@ static void init_idt(void)
|
|||||||
set_idt_entry(i, generic_common_handlers[i]);
|
set_idt_entry(i, generic_common_handlers[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
set_idt_entry(2, (uintptr_t)nmi);
|
set_idt_entry(2, (uintptr_t)nmi_handler);
|
||||||
set_idt_entry(13, (unsigned long)general_protection_exception);
|
set_idt_entry(13, (unsigned long)general_protection_exception);
|
||||||
set_idt_entry(14, (unsigned long)page_fault);
|
set_idt_entry(14, (unsigned long)page_fault);
|
||||||
|
|
||||||
@@ -955,6 +952,8 @@ void handle_interrupt(int vector, struct x86_user_context *regs)
|
|||||||
v->flags |= CPU_FLAG_NEED_RESCHED;
|
v->flags |= CPU_FLAG_NEED_RESCHED;
|
||||||
ihk_mc_spinlock_unlock(&v->runq_lock, irqstate);
|
ihk_mc_spinlock_unlock(&v->runq_lock, irqstate);
|
||||||
dkprintf("timer[%lu]: CPU_FLAG_NEED_RESCHED \n", rdtsc());
|
dkprintf("timer[%lu]: CPU_FLAG_NEED_RESCHED \n", rdtsc());
|
||||||
|
|
||||||
|
do_backlog();
|
||||||
}
|
}
|
||||||
else if (vector == LOCAL_PERF_VECTOR) {
|
else if (vector == LOCAL_PERF_VECTOR) {
|
||||||
struct siginfo info;
|
struct siginfo info;
|
||||||
@@ -1206,6 +1205,15 @@ unsigned long cpu_disable_interrupt_save(void)
|
|||||||
return flags;
|
return flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsigned long cpu_enable_interrupt_save(void)
|
||||||
|
{
|
||||||
|
unsigned long flags;
|
||||||
|
|
||||||
|
asm volatile("pushf; pop %0; sti" : "=r"(flags) : : "memory", "cc");
|
||||||
|
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
|
|
||||||
/*@
|
/*@
|
||||||
@ behavior valid_vector:
|
@ behavior valid_vector:
|
||||||
@ assumes 32 <= vector <= 255;
|
@ assumes 32 <= vector <= 255;
|
||||||
@@ -1602,14 +1610,18 @@ int ihk_mc_arch_get_special_register(enum ihk_asr_type type,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*@
|
/*@
|
||||||
@ requires \valid_apicid(cpu); // valid APIC ID or not
|
@ requires \valid_cpuid(cpu); // valid CPU logical ID
|
||||||
@ ensures \result == 0
|
@ ensures \result == 0
|
||||||
@*/
|
@*/
|
||||||
int ihk_mc_interrupt_cpu(int cpu, int vector)
|
int ihk_mc_interrupt_cpu(int cpu, int vector)
|
||||||
{
|
{
|
||||||
|
if (cpu < 0 || cpu >= num_processors) {
|
||||||
|
kprintf("%s: invalid CPU id: %d\n", __func__, cpu);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
dkprintf("[%d] ihk_mc_interrupt_cpu: %d\n", ihk_mc_get_processor_id(), cpu);
|
dkprintf("[%d] ihk_mc_interrupt_cpu: %d\n", ihk_mc_get_processor_id(), cpu);
|
||||||
|
|
||||||
x86_issue_ipi(cpu, vector);
|
x86_issue_ipi(get_x86_cpu_local_variable(cpu)->apic_id, vector);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1624,6 +1636,7 @@ struct thread *arch_switch_context(struct thread *prev, struct thread *next)
|
|||||||
/* Set up new TLS.. */
|
/* Set up new TLS.. */
|
||||||
ihk_mc_init_user_tlsbase(next->uctx, next->tlsblock_base);
|
ihk_mc_init_user_tlsbase(next->uctx, next->tlsblock_base);
|
||||||
|
|
||||||
|
#ifdef ENABLE_PERF
|
||||||
/* Performance monitoring inherit */
|
/* Performance monitoring inherit */
|
||||||
if(next->proc->monitoring_event) {
|
if(next->proc->monitoring_event) {
|
||||||
if(next->proc->perf_status == PP_RESET)
|
if(next->proc->perf_status == PP_RESET)
|
||||||
@@ -1633,6 +1646,7 @@ struct thread *arch_switch_context(struct thread *prev, struct thread *next)
|
|||||||
perf_start(next->proc->monitoring_event);
|
perf_start(next->proc->monitoring_event);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef PROFILE_ENABLE
|
#ifdef PROFILE_ENABLE
|
||||||
if (prev && prev->profile && prev->profile_start_ts != 0) {
|
if (prev && prev->profile && prev->profile_start_ts != 0) {
|
||||||
@@ -1708,7 +1722,7 @@ check_and_allocate_fp_regs(struct thread *thread)
|
|||||||
|
|
||||||
if (!thread->fp_regs) {
|
if (!thread->fp_regs) {
|
||||||
kprintf("error: allocating fp_regs pages\n");
|
kprintf("error: allocating fp_regs pages\n");
|
||||||
result = 1;
|
result = -ENOMEM;
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1721,12 +1735,14 @@ out:
|
|||||||
/*@
|
/*@
|
||||||
@ requires \valid(thread);
|
@ requires \valid(thread);
|
||||||
@*/
|
@*/
|
||||||
void
|
int
|
||||||
save_fp_regs(struct thread *thread)
|
save_fp_regs(struct thread *thread)
|
||||||
{
|
{
|
||||||
if (check_and_allocate_fp_regs(thread) != 0) {
|
int ret = 0;
|
||||||
// alloc error
|
|
||||||
return;
|
ret = check_and_allocate_fp_regs(thread);
|
||||||
|
if (ret) {
|
||||||
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (xsave_available) {
|
if (xsave_available) {
|
||||||
@@ -1741,13 +1757,23 @@ save_fp_regs(struct thread *thread)
|
|||||||
|
|
||||||
dkprintf("fp_regs for TID %d saved\n", thread->tid);
|
dkprintf("fp_regs for TID %d saved\n", thread->tid);
|
||||||
}
|
}
|
||||||
|
out:
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
void copy_fp_regs(struct thread *from, struct thread *to)
|
int copy_fp_regs(struct thread *from, struct thread *to)
|
||||||
{
|
{
|
||||||
if ((from->fp_regs != NULL) && (check_and_allocate_fp_regs(to) == 0)) {
|
int ret = 0;
|
||||||
memcpy(to->fp_regs, from->fp_regs, sizeof(fp_regs_struct));
|
|
||||||
|
if (from->fp_regs != NULL) {
|
||||||
|
ret = check_and_allocate_fp_regs(to);
|
||||||
|
if (!ret) {
|
||||||
|
memcpy(to->fp_regs,
|
||||||
|
from->fp_regs,
|
||||||
|
sizeof(fp_regs_struct));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*@
|
/*@
|
||||||
@@ -1820,6 +1846,10 @@ ihk_mc_init_user_tlsbase(ihk_mc_user_context_t *ctx,
|
|||||||
do_arch_prctl(ARCH_SET_FS, tls_base_addr);
|
do_arch_prctl(ARCH_SET_FS, tls_base_addr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void arch_flush_icache_all(void)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
/*@
|
/*@
|
||||||
@ assigns \nothing;
|
@ assigns \nothing;
|
||||||
@@ -1973,6 +2003,92 @@ mod_nmi_ctx(void *nmi_ctx, void (*func)())
|
|||||||
l[i++] = 0x28; // KERNEL DS
|
l[i++] = 0x28; // KERNEL DS
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void arch_save_panic_regs(void *irq_regs)
|
||||||
|
{
|
||||||
|
struct thread *current = cpu_local_var(current);
|
||||||
|
struct x86_user_context *regs =
|
||||||
|
(struct x86_user_context *)irq_regs;
|
||||||
|
struct x86_cpu_local_variables *x86v =
|
||||||
|
get_x86_cpu_local_variable(ihk_mc_get_processor_id());
|
||||||
|
struct segment_regs {
|
||||||
|
uint32_t rflags;
|
||||||
|
uint32_t cs;
|
||||||
|
uint32_t ss;
|
||||||
|
uint32_t ds;
|
||||||
|
uint32_t es;
|
||||||
|
uint32_t fs;
|
||||||
|
uint32_t gs;
|
||||||
|
} *sregs;
|
||||||
|
|
||||||
|
/* Kernel space? */
|
||||||
|
if (regs->gpr.rip > USER_END) {
|
||||||
|
x86v->panic_regs[0] = regs->gpr.rax;
|
||||||
|
x86v->panic_regs[1] = regs->gpr.rbx;
|
||||||
|
x86v->panic_regs[2] = regs->gpr.rcx;
|
||||||
|
x86v->panic_regs[3] = regs->gpr.rdx;
|
||||||
|
x86v->panic_regs[4] = regs->gpr.rsi;
|
||||||
|
x86v->panic_regs[5] = regs->gpr.rdi;
|
||||||
|
x86v->panic_regs[6] = regs->gpr.rbp;
|
||||||
|
x86v->panic_regs[7] = regs->gpr.rsp;
|
||||||
|
x86v->panic_regs[8] = regs->gpr.r8;
|
||||||
|
x86v->panic_regs[9] = regs->gpr.r9;
|
||||||
|
x86v->panic_regs[10] = regs->gpr.r10;
|
||||||
|
x86v->panic_regs[11] = regs->gpr.r11;
|
||||||
|
x86v->panic_regs[12] = regs->gpr.r12;
|
||||||
|
x86v->panic_regs[13] = regs->gpr.r13;
|
||||||
|
x86v->panic_regs[14] = regs->gpr.r14;
|
||||||
|
x86v->panic_regs[15] = regs->gpr.r15;
|
||||||
|
x86v->panic_regs[16] = regs->gpr.rip;
|
||||||
|
sregs = (struct segment_regs *)&x86v->panic_regs[17];
|
||||||
|
sregs->rflags = regs->gpr.rflags;
|
||||||
|
sregs->cs = regs->gpr.cs;
|
||||||
|
sregs->ss = regs->gpr.ss;
|
||||||
|
sregs->ds = regs->sr.ds;
|
||||||
|
sregs->es = regs->sr.es;
|
||||||
|
sregs->fs = regs->sr.fs;
|
||||||
|
sregs->gs = regs->sr.gs;
|
||||||
|
}
|
||||||
|
/* User-space, show kernel context */
|
||||||
|
else {
|
||||||
|
kprintf("%s: in user-space: %p\n", __func__, regs->gpr.rip);
|
||||||
|
x86v->panic_regs[0] = 0;
|
||||||
|
x86v->panic_regs[1] = current->ctx.rbx;
|
||||||
|
x86v->panic_regs[2] = 0;
|
||||||
|
x86v->panic_regs[3] = 0;
|
||||||
|
x86v->panic_regs[4] = current->ctx.rsi;
|
||||||
|
x86v->panic_regs[5] = current->ctx.rdi;
|
||||||
|
x86v->panic_regs[6] = current->ctx.rbp;
|
||||||
|
x86v->panic_regs[7] = current->ctx.rsp;
|
||||||
|
x86v->panic_regs[8] = 0;
|
||||||
|
x86v->panic_regs[9] = 0;
|
||||||
|
x86v->panic_regs[10] = 0;
|
||||||
|
x86v->panic_regs[11] = 0;
|
||||||
|
x86v->panic_regs[12] = regs->gpr.r12;
|
||||||
|
x86v->panic_regs[13] = regs->gpr.r13;
|
||||||
|
x86v->panic_regs[14] = regs->gpr.r14;
|
||||||
|
x86v->panic_regs[15] = regs->gpr.r15;
|
||||||
|
x86v->panic_regs[16] = (unsigned long)enter_user_mode;
|
||||||
|
sregs = (struct segment_regs *)&x86v->panic_regs[17];
|
||||||
|
sregs->rflags = regs->gpr.rflags;
|
||||||
|
sregs->cs = regs->gpr.cs;
|
||||||
|
sregs->ss = regs->gpr.ss;
|
||||||
|
sregs->ds = regs->sr.ds;
|
||||||
|
sregs->es = regs->sr.es;
|
||||||
|
sregs->fs = regs->sr.fs;
|
||||||
|
sregs->gs = regs->sr.gs;
|
||||||
|
}
|
||||||
|
|
||||||
|
x86v->paniced = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
void arch_clear_panic(void)
|
||||||
|
{
|
||||||
|
struct x86_cpu_local_variables *x86v =
|
||||||
|
get_x86_cpu_local_variable(ihk_mc_get_processor_id());
|
||||||
|
|
||||||
|
x86v->paniced = 0;
|
||||||
|
}
|
||||||
|
|
||||||
int arch_cpu_read_write_register(
|
int arch_cpu_read_write_register(
|
||||||
struct ihk_os_cpu_register *desc,
|
struct ihk_os_cpu_register *desc,
|
||||||
enum mcctrl_os_cpu_operation op)
|
enum mcctrl_os_cpu_operation op)
|
||||||
@@ -2096,9 +2212,7 @@ int smp_call_func(cpu_set_t *__cpu_set, smp_func_t __func, void *__arg)
|
|||||||
ihk_mc_spinlock_unlock(&get_cpu_local_var(cpu)->smp_func_req_lock,
|
ihk_mc_spinlock_unlock(&get_cpu_local_var(cpu)->smp_func_req_lock,
|
||||||
irq_flags);
|
irq_flags);
|
||||||
|
|
||||||
ihk_mc_interrupt_cpu(
|
ihk_mc_interrupt_cpu(cpu, LOCAL_SMP_FUNC_CALL_VECTOR);
|
||||||
get_x86_cpu_local_variable(cpu)->apic_id,
|
|
||||||
LOCAL_SMP_FUNC_CALL_VECTOR);
|
|
||||||
|
|
||||||
++cpu_index;
|
++cpu_index;
|
||||||
}
|
}
|
||||||
@@ -2130,4 +2244,48 @@ free_out:
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern int nmi_mode;
|
||||||
|
extern long freeze_thaw(void *nmi_ctx);
|
||||||
|
|
||||||
|
void multi_nm_interrupt_handler(void *irq_regs)
|
||||||
|
{
|
||||||
|
dkprintf("%s: ...\n", __func__);
|
||||||
|
switch (nmi_mode) {
|
||||||
|
case 1:
|
||||||
|
case 2:
|
||||||
|
/* mode == 1 or 2, for FREEZER NMI */
|
||||||
|
dkprintf("%s: freeze mode NMI catch. (nmi_mode=%d)\n",
|
||||||
|
__func__, nmi_mode);
|
||||||
|
freeze_thaw(NULL);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 0:
|
||||||
|
/* mode == 0, for MEMDUMP NMI */
|
||||||
|
arch_save_panic_regs(irq_regs);
|
||||||
|
ihk_mc_query_mem_areas();
|
||||||
|
/* memdump-nmi is halted McKernel, break is unnecessary. */
|
||||||
|
/* fall through */
|
||||||
|
case 3:
|
||||||
|
/* mode == 3, for SHUTDOWN-WAIT NMI */
|
||||||
|
kprintf("%s: STOP\n", __func__);
|
||||||
|
while (nmi_mode != 4)
|
||||||
|
cpu_halt();
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 4:
|
||||||
|
/* mode == 4, continue NMI */
|
||||||
|
arch_clear_panic();
|
||||||
|
if (!ihk_mc_get_processor_id()) {
|
||||||
|
ihk_mc_clear_dump_page_completion();
|
||||||
|
}
|
||||||
|
kprintf("%s: RESUME, nmi_mode: %d\n", __func__, nmi_mode);
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
ekprintf("%s: Unknown nmi-mode(%d) detected.\n",
|
||||||
|
__func__, nmi_mode);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*** end of file ***/
|
/*** end of file ***/
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ extern void preempt_disable(void);
|
|||||||
|
|
||||||
#define IHK_STATIC_SPINLOCK_FUNCS
|
#define IHK_STATIC_SPINLOCK_FUNCS
|
||||||
|
|
||||||
static void ihk_mc_spinlock_init(ihk_spinlock_t *lock)
|
static inline void ihk_mc_spinlock_init(ihk_spinlock_t *lock)
|
||||||
{
|
{
|
||||||
lock->head_tail = 0;
|
lock->head_tail = 0;
|
||||||
}
|
}
|
||||||
@@ -50,10 +50,13 @@ rc = __ihk_mc_spinlock_trylock_noirq(l); \
|
|||||||
#define ihk_mc_spinlock_trylock_noirq __ihk_mc_spinlock_trylock_noirq
|
#define ihk_mc_spinlock_trylock_noirq __ihk_mc_spinlock_trylock_noirq
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static int __ihk_mc_spinlock_trylock_noirq(ihk_spinlock_t *lock)
|
static inline int __ihk_mc_spinlock_trylock_noirq(ihk_spinlock_t *lock)
|
||||||
{
|
{
|
||||||
ihk_spinlock_t cur = { .head_tail = lock->head_tail };
|
ihk_spinlock_t cur = { .head_tail = lock->head_tail };
|
||||||
ihk_spinlock_t next = { .tickets.head = cur.tickets.head, .tickets.tail = cur.tickets.tail + 2 };
|
ihk_spinlock_t next = { .tickets = {
|
||||||
|
.head = cur.tickets.head,
|
||||||
|
.tail = cur.tickets.tail + 2
|
||||||
|
} };
|
||||||
int success;
|
int success;
|
||||||
|
|
||||||
if (cur.tickets.head != cur.tickets.tail) {
|
if (cur.tickets.head != cur.tickets.tail) {
|
||||||
@@ -80,7 +83,8 @@ __kprintf("[%d] ret ihk_mc_spinlock_trylock\n", ihk_mc_get_processor_id()); rc;\
|
|||||||
#else
|
#else
|
||||||
#define ihk_mc_spinlock_trylock __ihk_mc_spinlock_trylock
|
#define ihk_mc_spinlock_trylock __ihk_mc_spinlock_trylock
|
||||||
#endif
|
#endif
|
||||||
static unsigned long __ihk_mc_spinlock_trylock(ihk_spinlock_t *lock, int *result)
|
static inline unsigned long __ihk_mc_spinlock_trylock(ihk_spinlock_t *lock,
|
||||||
|
int *result)
|
||||||
{
|
{
|
||||||
unsigned long flags;
|
unsigned long flags;
|
||||||
|
|
||||||
@@ -101,7 +105,7 @@ __kprintf("[%d] ret ihk_mc_spinlock_lock_noirq\n", ihk_mc_get_processor_id()); \
|
|||||||
#define ihk_mc_spinlock_lock_noirq __ihk_mc_spinlock_lock_noirq
|
#define ihk_mc_spinlock_lock_noirq __ihk_mc_spinlock_lock_noirq
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void __ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
|
static inline void __ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
|
||||||
{
|
{
|
||||||
register struct __raw_tickets inc = { .tail = 0x0002 };
|
register struct __raw_tickets inc = { .tail = 0x0002 };
|
||||||
|
|
||||||
@@ -132,7 +136,7 @@ __kprintf("[%d] ret ihk_mc_spinlock_lock\n", ihk_mc_get_processor_id()); rc;\
|
|||||||
#else
|
#else
|
||||||
#define ihk_mc_spinlock_lock __ihk_mc_spinlock_lock
|
#define ihk_mc_spinlock_lock __ihk_mc_spinlock_lock
|
||||||
#endif
|
#endif
|
||||||
static unsigned long __ihk_mc_spinlock_lock(ihk_spinlock_t *lock)
|
static inline unsigned long __ihk_mc_spinlock_lock(ihk_spinlock_t *lock)
|
||||||
{
|
{
|
||||||
unsigned long flags;
|
unsigned long flags;
|
||||||
|
|
||||||
@@ -152,7 +156,7 @@ __kprintf("[%d] ret ihk_mc_spinlock_unlock_noirq\n", ihk_mc_get_processor_id());
|
|||||||
#else
|
#else
|
||||||
#define ihk_mc_spinlock_unlock_noirq __ihk_mc_spinlock_unlock_noirq
|
#define ihk_mc_spinlock_unlock_noirq __ihk_mc_spinlock_unlock_noirq
|
||||||
#endif
|
#endif
|
||||||
static void __ihk_mc_spinlock_unlock_noirq(ihk_spinlock_t *lock)
|
static inline void __ihk_mc_spinlock_unlock_noirq(ihk_spinlock_t *lock)
|
||||||
{
|
{
|
||||||
__ticket_t inc = 0x0002;
|
__ticket_t inc = 0x0002;
|
||||||
|
|
||||||
@@ -171,100 +175,14 @@ __kprintf("[%d] ret ihk_mc_spinlock_unlock\n", ihk_mc_get_processor_id()); \
|
|||||||
#else
|
#else
|
||||||
#define ihk_mc_spinlock_unlock __ihk_mc_spinlock_unlock
|
#define ihk_mc_spinlock_unlock __ihk_mc_spinlock_unlock
|
||||||
#endif
|
#endif
|
||||||
static void __ihk_mc_spinlock_unlock(ihk_spinlock_t *lock, unsigned long flags)
|
static inline void __ihk_mc_spinlock_unlock(ihk_spinlock_t *lock,
|
||||||
|
unsigned long flags)
|
||||||
{
|
{
|
||||||
__ihk_mc_spinlock_unlock_noirq(lock);
|
__ihk_mc_spinlock_unlock_noirq(lock);
|
||||||
|
|
||||||
cpu_restore_interrupt(flags);
|
cpu_restore_interrupt(flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* An implementation of the Mellor-Crummey Scott (MCS) lock */
|
|
||||||
typedef struct mcs_lock_node {
|
|
||||||
unsigned long locked;
|
|
||||||
struct mcs_lock_node *next;
|
|
||||||
unsigned long irqsave;
|
|
||||||
#ifndef ENABLE_UBSAN
|
|
||||||
} __aligned(64) mcs_lock_node_t;
|
|
||||||
#else
|
|
||||||
} mcs_lock_node_t;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef mcs_lock_node_t mcs_lock_t;
|
|
||||||
|
|
||||||
static void mcs_lock_init(struct mcs_lock_node *node)
|
|
||||||
{
|
|
||||||
node->locked = 0;
|
|
||||||
node->next = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void __mcs_lock_lock(struct mcs_lock_node *lock,
|
|
||||||
struct mcs_lock_node *node)
|
|
||||||
{
|
|
||||||
struct mcs_lock_node *pred;
|
|
||||||
|
|
||||||
node->next = NULL;
|
|
||||||
node->locked = 0;
|
|
||||||
pred = (struct mcs_lock_node *)xchg8((unsigned long *)&lock->next,
|
|
||||||
(unsigned long)node);
|
|
||||||
|
|
||||||
if (pred) {
|
|
||||||
node->locked = 1;
|
|
||||||
pred->next = node;
|
|
||||||
while (node->locked != 0) {
|
|
||||||
cpu_pause();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void __mcs_lock_unlock(struct mcs_lock_node *lock,
|
|
||||||
struct mcs_lock_node *node)
|
|
||||||
{
|
|
||||||
if (node->next == NULL) {
|
|
||||||
struct mcs_lock_node *old = (struct mcs_lock_node *)
|
|
||||||
atomic_cmpxchg8((unsigned long *)&lock->next,
|
|
||||||
(unsigned long)node, (unsigned long)0);
|
|
||||||
|
|
||||||
if (old == node) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
while (node->next == NULL) {
|
|
||||||
cpu_pause();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
node->next->locked = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mcs_lock_lock_noirq(struct mcs_lock_node *lock,
|
|
||||||
struct mcs_lock_node *node)
|
|
||||||
{
|
|
||||||
preempt_disable();
|
|
||||||
__mcs_lock_lock(lock, node);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mcs_lock_unlock_noirq(struct mcs_lock_node *lock,
|
|
||||||
struct mcs_lock_node *node)
|
|
||||||
{
|
|
||||||
__mcs_lock_unlock(lock, node);
|
|
||||||
preempt_enable();
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mcs_lock_lock(struct mcs_lock_node *lock,
|
|
||||||
struct mcs_lock_node *node)
|
|
||||||
{
|
|
||||||
node->irqsave = cpu_disable_interrupt_save();
|
|
||||||
mcs_lock_lock_noirq(lock, node);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mcs_lock_unlock(struct mcs_lock_node *lock,
|
|
||||||
struct mcs_lock_node *node)
|
|
||||||
{
|
|
||||||
mcs_lock_unlock_noirq(lock, node);
|
|
||||||
cpu_restore_interrupt(node->irqsave);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#define SPINLOCK_IN_MCS_RWLOCK
|
#define SPINLOCK_IN_MCS_RWLOCK
|
||||||
|
|
||||||
// reader/writer lock
|
// reader/writer lock
|
||||||
@@ -310,7 +228,7 @@ typedef struct mcs_rwlock_lock {
|
|||||||
} mcs_rwlock_lock_t;
|
} mcs_rwlock_lock_t;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void
|
static inline void
|
||||||
mcs_rwlock_init(struct mcs_rwlock_lock *lock)
|
mcs_rwlock_init(struct mcs_rwlock_lock *lock)
|
||||||
{
|
{
|
||||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||||
@@ -331,7 +249,7 @@ __kprintf("[%d] ret mcs_rwlock_writer_lock_noirq\n", ihk_mc_get_processor_id());
|
|||||||
#else
|
#else
|
||||||
#define mcs_rwlock_writer_lock_noirq __mcs_rwlock_writer_lock_noirq
|
#define mcs_rwlock_writer_lock_noirq __mcs_rwlock_writer_lock_noirq
|
||||||
#endif
|
#endif
|
||||||
static void
|
static inline void
|
||||||
__mcs_rwlock_writer_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
|
__mcs_rwlock_writer_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
|
||||||
{
|
{
|
||||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||||
@@ -358,7 +276,7 @@ __mcs_rwlock_writer_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_n
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifndef SPINLOCK_IN_MCS_RWLOCK
|
#ifndef SPINLOCK_IN_MCS_RWLOCK
|
||||||
static void
|
static inline void
|
||||||
mcs_rwlock_unlock_readers(struct mcs_rwlock_lock *lock)
|
mcs_rwlock_unlock_readers(struct mcs_rwlock_lock *lock)
|
||||||
{
|
{
|
||||||
struct mcs_rwlock_node *p;
|
struct mcs_rwlock_node *p;
|
||||||
@@ -425,7 +343,7 @@ __kprintf("[%d] ret mcs_rwlock_writer_unlock_noirq\n", ihk_mc_get_processor_id()
|
|||||||
#else
|
#else
|
||||||
#define mcs_rwlock_writer_unlock_noirq __mcs_rwlock_writer_unlock_noirq
|
#define mcs_rwlock_writer_unlock_noirq __mcs_rwlock_writer_unlock_noirq
|
||||||
#endif
|
#endif
|
||||||
static void
|
static inline void
|
||||||
__mcs_rwlock_writer_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
|
__mcs_rwlock_writer_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
|
||||||
{
|
{
|
||||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||||
@@ -485,7 +403,7 @@ atomic_inc_ifnot0(ihk_atomic_t *v)
|
|||||||
return old;
|
return old;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static inline void
|
||||||
__mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
|
__mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
|
||||||
{
|
{
|
||||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||||
@@ -551,7 +469,7 @@ __kprintf("[%d] ret mcs_rwlock_reader_unlock_noirq\n", ihk_mc_get_processor_id()
|
|||||||
#else
|
#else
|
||||||
#define mcs_rwlock_reader_unlock_noirq __mcs_rwlock_reader_unlock_noirq
|
#define mcs_rwlock_reader_unlock_noirq __mcs_rwlock_reader_unlock_noirq
|
||||||
#endif
|
#endif
|
||||||
static void
|
static inline void
|
||||||
__mcs_rwlock_reader_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
|
__mcs_rwlock_reader_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
|
||||||
{
|
{
|
||||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||||
@@ -598,7 +516,7 @@ __kprintf("[%d] ret mcs_rwlock_writer_lock\n", ihk_mc_get_processor_id()); \
|
|||||||
#else
|
#else
|
||||||
#define mcs_rwlock_writer_lock __mcs_rwlock_writer_lock
|
#define mcs_rwlock_writer_lock __mcs_rwlock_writer_lock
|
||||||
#endif
|
#endif
|
||||||
static void
|
static inline void
|
||||||
__mcs_rwlock_writer_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
|
__mcs_rwlock_writer_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
|
||||||
{
|
{
|
||||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||||
@@ -618,7 +536,7 @@ __kprintf("[%d] ret mcs_rwlock_writer_unlock\n", ihk_mc_get_processor_id()); \
|
|||||||
#else
|
#else
|
||||||
#define mcs_rwlock_writer_unlock __mcs_rwlock_writer_unlock
|
#define mcs_rwlock_writer_unlock __mcs_rwlock_writer_unlock
|
||||||
#endif
|
#endif
|
||||||
static void
|
static inline void
|
||||||
__mcs_rwlock_writer_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
|
__mcs_rwlock_writer_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
|
||||||
{
|
{
|
||||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||||
@@ -638,7 +556,7 @@ __kprintf("[%d] ret mcs_rwlock_reader_lock\n", ihk_mc_get_processor_id()); \
|
|||||||
#else
|
#else
|
||||||
#define mcs_rwlock_reader_lock __mcs_rwlock_reader_lock
|
#define mcs_rwlock_reader_lock __mcs_rwlock_reader_lock
|
||||||
#endif
|
#endif
|
||||||
static void
|
static inline void
|
||||||
__mcs_rwlock_reader_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
|
__mcs_rwlock_reader_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
|
||||||
{
|
{
|
||||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||||
@@ -658,7 +576,7 @@ __kprintf("[%d] ret mcs_rwlock_reader_unlock\n", ihk_mc_get_processor_id()); \
|
|||||||
#else
|
#else
|
||||||
#define mcs_rwlock_reader_unlock __mcs_rwlock_reader_unlock
|
#define mcs_rwlock_reader_unlock __mcs_rwlock_reader_unlock
|
||||||
#endif
|
#endif
|
||||||
static void
|
static inline void
|
||||||
__mcs_rwlock_reader_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
|
__mcs_rwlock_reader_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
|
||||||
{
|
{
|
||||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||||
@@ -674,4 +592,90 @@ static inline int irqflags_can_interrupt(unsigned long flags)
|
|||||||
return !!(flags & 0x200);
|
return !!(flags & 0x200);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct ihk_rwlock {
|
||||||
|
union {
|
||||||
|
long lock;
|
||||||
|
struct {
|
||||||
|
unsigned int read;
|
||||||
|
int write;
|
||||||
|
};
|
||||||
|
} lock;
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline void ihk_mc_rwlock_init(struct ihk_rwlock *rw)
|
||||||
|
{
|
||||||
|
rw->lock.read = 0;
|
||||||
|
rw->lock.write = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void ihk_mc_read_lock(struct ihk_rwlock *rw)
|
||||||
|
{
|
||||||
|
asm volatile("1:\t"
|
||||||
|
"lock; decq %0\n\t"
|
||||||
|
"jns 3f\n\t"
|
||||||
|
"lock incq %0\n\t"
|
||||||
|
"2:\t"
|
||||||
|
"pause\n\t"
|
||||||
|
"cmpq $0x1, %0\n\t"
|
||||||
|
"jns 1b\n\t"
|
||||||
|
"jmp 2b\n\t"
|
||||||
|
"3:"
|
||||||
|
: "+m" (rw->lock.lock) : : "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void ihk_mc_write_lock(struct ihk_rwlock *rw)
|
||||||
|
{
|
||||||
|
asm volatile("1:\t"
|
||||||
|
"lock; decl %0\n\t"
|
||||||
|
"je 3f\n\t"
|
||||||
|
"lock; incl %0\n\t"
|
||||||
|
"2:\t"
|
||||||
|
"pause\n\t"
|
||||||
|
"cmpl $0x1,%0\n\t"
|
||||||
|
"je 1b\n\t"
|
||||||
|
"jmp 2b\n\t"
|
||||||
|
"3:"
|
||||||
|
: "+m" (rw->lock.write) : "i" (((1L) << 32)) : "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int ihk_mc_read_trylock(struct ihk_rwlock *rw)
|
||||||
|
{
|
||||||
|
ihk_atomic64_t *count = (ihk_atomic64_t *)rw;
|
||||||
|
|
||||||
|
if (ihk_atomic64_sub_return(1, count) >= 0)
|
||||||
|
return 1;
|
||||||
|
ihk_atomic64_inc(count);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int ihk_mc_write_trylock(struct ihk_rwlock *rw)
|
||||||
|
{
|
||||||
|
ihk_atomic_t *count = (ihk_atomic_t *)&rw->lock.write;
|
||||||
|
|
||||||
|
if (ihk_atomic_dec_and_test(count))
|
||||||
|
return 1;
|
||||||
|
ihk_atomic_inc(count);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void ihk_mc_read_unlock(struct ihk_rwlock *rw)
|
||||||
|
{
|
||||||
|
asm volatile("lock; incq %0" : "+m" (rw->lock.lock) : : "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void ihk_mc_write_unlock(struct ihk_rwlock *rw)
|
||||||
|
{
|
||||||
|
asm volatile("lock; incl %0"
|
||||||
|
: "+m" (rw->lock.write) : "i" (((1L) << 32)) : "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int ihk_mc_write_can_lock(struct ihk_rwlock *rw)
|
||||||
|
{
|
||||||
|
return rw->lock.write == 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int ihk_mc_read_can_lock(struct ihk_rwlock *rw)
|
||||||
|
{
|
||||||
|
return rw->lock.lock > 0;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
#define __HEADER_X86_COMMON_ARCH_MEMORY_H
|
#define __HEADER_X86_COMMON_ARCH_MEMORY_H
|
||||||
|
|
||||||
#include <ihk/types.h>
|
#include <ihk/types.h>
|
||||||
|
#include <errno.h>
|
||||||
|
|
||||||
#define KERNEL_CS_ENTRY 4
|
#define KERNEL_CS_ENTRY 4
|
||||||
#define KERNEL_DS_ENTRY 5
|
#define KERNEL_DS_ENTRY 5
|
||||||
@@ -66,8 +67,8 @@
|
|||||||
* Placing the LWK image in the virtual address space at the end of
|
* Placing the LWK image in the virtual address space at the end of
|
||||||
* the Linux modules section enables us to map the LWK TEXT in Linux
|
* the Linux modules section enables us to map the LWK TEXT in Linux
|
||||||
* as well, so that Linux can also call into LWK text.
|
* as well, so that Linux can also call into LWK text.
|
||||||
|
* It's defined by cmake.
|
||||||
*/
|
*/
|
||||||
#define MAP_KERNEL_START 0xFFFFFFFFFE800000UL
|
|
||||||
#define STACK_TOP(region) ((region)->user_end)
|
#define STACK_TOP(region) ((region)->user_end)
|
||||||
|
|
||||||
#define MAP_VMAP_SIZE 0x0000000100000000UL
|
#define MAP_VMAP_SIZE 0x0000000100000000UL
|
||||||
@@ -183,12 +184,10 @@ enum ihk_mc_pt_attribute {
|
|||||||
|
|
||||||
enum ihk_mc_pt_attribute attr_mask;
|
enum ihk_mc_pt_attribute attr_mask;
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_12
|
|
||||||
static inline int pfn_is_write_combined(uintptr_t pfn)
|
static inline int pfn_is_write_combined(uintptr_t pfn)
|
||||||
{
|
{
|
||||||
return ((pfn & PFL1_PWT) && !(pfn & PFL1_PCD));
|
return ((pfn & PFL1_PWT) && !(pfn & PFL1_PCD));
|
||||||
}
|
}
|
||||||
#endif /* #ifdef POSTK_DEBUG_ARCH_DEP_12 */
|
|
||||||
|
|
||||||
static inline int pte_is_null(pte_t *ptep)
|
static inline int pte_is_null(pte_t *ptep)
|
||||||
{
|
{
|
||||||
@@ -365,6 +364,17 @@ static inline int pgsize_to_tbllv(size_t pgsize)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline int pgsize_to_pgshift(size_t pgsize)
|
||||||
|
{
|
||||||
|
switch (pgsize) {
|
||||||
|
case PTL1_SIZE: return PTL1_SHIFT;
|
||||||
|
case PTL2_SIZE: return PTL2_SHIFT;
|
||||||
|
case PTL3_SIZE: return PTL3_SHIFT;
|
||||||
|
case PTL4_SIZE: return PTL4_SHIFT;
|
||||||
|
default: return -EINVAL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static inline size_t tbllv_to_pgsize(int level)
|
static inline size_t tbllv_to_pgsize(int level)
|
||||||
{
|
{
|
||||||
switch (level) {
|
switch (level) {
|
||||||
|
|||||||
@@ -13,19 +13,17 @@
|
|||||||
#ifndef ARCH_CPU_H
|
#ifndef ARCH_CPU_H
|
||||||
#define ARCH_CPU_H
|
#define ARCH_CPU_H
|
||||||
|
|
||||||
|
#define mb() asm volatile("mfence":::"memory")
|
||||||
|
#define rmb() asm volatile("lfence":::"memory")
|
||||||
|
#define wmb() asm volatile("sfence" ::: "memory")
|
||||||
|
|
||||||
|
#define smp_mb() mb()
|
||||||
|
#define smp_rmb() rmb()
|
||||||
|
#define smp_wmb() barrier()
|
||||||
|
|
||||||
#define arch_barrier() asm volatile("" : : : "memory")
|
#define arch_barrier() asm volatile("" : : : "memory")
|
||||||
|
|
||||||
static inline void rmb(void)
|
static inline unsigned long read_tsc(void)
|
||||||
{
|
|
||||||
arch_barrier();
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline void wmb(void)
|
|
||||||
{
|
|
||||||
arch_barrier();
|
|
||||||
}
|
|
||||||
|
|
||||||
static unsigned long read_tsc(void)
|
|
||||||
{
|
{
|
||||||
unsigned int low, high;
|
unsigned int low, high;
|
||||||
|
|
||||||
@@ -34,4 +32,21 @@ static unsigned long read_tsc(void)
|
|||||||
return (low | ((unsigned long)high << 32));
|
return (low | ((unsigned long)high << 32));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define smp_load_acquire(p) \
|
||||||
|
({ \
|
||||||
|
typeof(*p) ___p1 = ACCESS_ONCE(*p); \
|
||||||
|
compiletime_assert_atomic_type(*p); \
|
||||||
|
barrier(); \
|
||||||
|
___p1; \
|
||||||
|
})
|
||||||
|
|
||||||
|
#define smp_store_release(p, v) \
|
||||||
|
({ \
|
||||||
|
compiletime_assert_atomic_type(*p); \
|
||||||
|
barrier(); \
|
||||||
|
WRITE_ONCE(*p, v); \
|
||||||
|
})
|
||||||
|
|
||||||
|
void arch_flush_icache_all(void);
|
||||||
|
|
||||||
#endif /* ARCH_CPU_H */
|
#endif /* ARCH_CPU_H */
|
||||||
|
|||||||
@@ -1,32 +0,0 @@
|
|||||||
#ifndef ARCH_RUSAGE_H_INCLUDED
|
|
||||||
#define ARCH_RUSAGE_H_INCLUDED
|
|
||||||
|
|
||||||
#define DEBUG_RUSAGE
|
|
||||||
|
|
||||||
#define IHK_OS_PGSIZE_4KB 0
|
|
||||||
#define IHK_OS_PGSIZE_2MB 1
|
|
||||||
#define IHK_OS_PGSIZE_1GB 2
|
|
||||||
|
|
||||||
extern struct rusage_global rusage;
|
|
||||||
|
|
||||||
static inline int rusage_pgsize_to_pgtype(size_t pgsize)
|
|
||||||
{
|
|
||||||
int ret = IHK_OS_PGSIZE_4KB;
|
|
||||||
switch (pgsize) {
|
|
||||||
case PTL1_SIZE:
|
|
||||||
ret = IHK_OS_PGSIZE_4KB;
|
|
||||||
break;
|
|
||||||
case PTL2_SIZE:
|
|
||||||
ret = IHK_OS_PGSIZE_2MB;
|
|
||||||
break;
|
|
||||||
case PTL3_SIZE:
|
|
||||||
ret = IHK_OS_PGSIZE_1GB;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
kprintf("%s: Error: Unknown pgsize=%ld\n", __FUNCTION__, pgsize);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif /* !defined(ARCH_RUSAGE_H_INCLUDED) */
|
|
||||||
@@ -13,6 +13,8 @@
|
|||||||
#ifndef HEADER_X86_COMMON_IHK_ATOMIC_H
|
#ifndef HEADER_X86_COMMON_IHK_ATOMIC_H
|
||||||
#define HEADER_X86_COMMON_IHK_ATOMIC_H
|
#define HEADER_X86_COMMON_IHK_ATOMIC_H
|
||||||
|
|
||||||
|
#include <lwk/compiler.h>
|
||||||
|
|
||||||
/***********************************************************************
|
/***********************************************************************
|
||||||
* ihk_atomic_t
|
* ihk_atomic_t
|
||||||
*/
|
*/
|
||||||
@@ -114,7 +116,7 @@ static inline long ihk_atomic64_read(const ihk_atomic64_t *v)
|
|||||||
return *(volatile long *)&(v)->counter64;
|
return *(volatile long *)&(v)->counter64;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void ihk_atomic64_set(ihk_atomic64_t *v, int i)
|
static inline void ihk_atomic64_set(ihk_atomic64_t *v, long i)
|
||||||
{
|
{
|
||||||
v->counter64 = i;
|
v->counter64 = i;
|
||||||
}
|
}
|
||||||
@@ -124,6 +126,22 @@ static inline void ihk_atomic64_inc(ihk_atomic64_t *v)
|
|||||||
asm volatile ("lock incq %0" : "+m"(v->counter64));
|
asm volatile ("lock incq %0" : "+m"(v->counter64));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline long ihk_atomic64_add_return(long i, ihk_atomic64_t *v)
|
||||||
|
{
|
||||||
|
long __i;
|
||||||
|
|
||||||
|
__i = i;
|
||||||
|
asm volatile("lock xaddq %0, %1"
|
||||||
|
: "+r" (i), "+m" (v->counter64)
|
||||||
|
: : "memory");
|
||||||
|
return i + __i;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline long ihk_atomic64_sub_return(long i, ihk_atomic64_t *v)
|
||||||
|
{
|
||||||
|
return ihk_atomic64_add_return(-i, v);
|
||||||
|
}
|
||||||
|
|
||||||
/***********************************************************************
|
/***********************************************************************
|
||||||
* others
|
* others
|
||||||
*/
|
*/
|
||||||
@@ -156,43 +174,55 @@ static inline unsigned long xchg8(unsigned long *ptr, unsigned long x)
|
|||||||
return __x;
|
return __x;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define __xchg(x, ptr, size) \
|
#define __X86_CASE_B 1
|
||||||
({ \
|
#define __X86_CASE_W 2
|
||||||
__typeof(*(ptr)) __x = (x); \
|
#define __X86_CASE_L 4
|
||||||
switch (size) { \
|
#define __X86_CASE_Q 8
|
||||||
case 1: \
|
|
||||||
asm volatile("xchgb %b0,%1" \
|
|
||||||
: "=q" (__x) \
|
|
||||||
: "m" (*__xg(ptr)), "0" (__x) \
|
|
||||||
: "memory"); \
|
|
||||||
break; \
|
|
||||||
case 2: \
|
|
||||||
asm volatile("xchgw %w0,%1" \
|
|
||||||
: "=r" (__x) \
|
|
||||||
: "m" (*__xg(ptr)), "0" (__x) \
|
|
||||||
: "memory"); \
|
|
||||||
break; \
|
|
||||||
case 4: \
|
|
||||||
asm volatile("xchgl %k0,%1" \
|
|
||||||
: "=r" (__x) \
|
|
||||||
: "m" (*__xg(ptr)), "0" (__x) \
|
|
||||||
: "memory"); \
|
|
||||||
break; \
|
|
||||||
case 8: \
|
|
||||||
asm volatile("xchgq %0,%1" \
|
|
||||||
: "=r" (__x) \
|
|
||||||
: "m" (*__xg(ptr)), "0" (__x) \
|
|
||||||
: "memory"); \
|
|
||||||
break; \
|
|
||||||
default: \
|
|
||||||
panic("xchg for wrong size"); \
|
|
||||||
} \
|
|
||||||
__x; \
|
|
||||||
})
|
|
||||||
|
|
||||||
|
extern void __xchg_wrong_size(void)
|
||||||
|
__compiletime_error("Bad argument size for xchg");
|
||||||
|
|
||||||
#define xchg(ptr, v) \
|
/*
|
||||||
__xchg((v), (ptr), sizeof(*ptr))
|
* An exchange-type operation, which takes a value and a pointer, and
|
||||||
|
* returns the old value.
|
||||||
|
*/
|
||||||
|
#define __xchg_op(ptr, arg, op, lock) \
|
||||||
|
({ \
|
||||||
|
__typeof__(*(ptr)) __ret = (arg); \
|
||||||
|
switch (sizeof(*(ptr))) { \
|
||||||
|
case __X86_CASE_B: \
|
||||||
|
asm volatile (lock #op "b %b0, %1\n" \
|
||||||
|
: "+q" (__ret), "+m" (*(ptr)) \
|
||||||
|
: : "memory", "cc"); \
|
||||||
|
break; \
|
||||||
|
case __X86_CASE_W: \
|
||||||
|
asm volatile (lock #op "w %w0, %1\n" \
|
||||||
|
: "+r" (__ret), "+m" (*(ptr)) \
|
||||||
|
: : "memory", "cc"); \
|
||||||
|
break; \
|
||||||
|
case __X86_CASE_L: \
|
||||||
|
asm volatile (lock #op "l %0, %1\n" \
|
||||||
|
: "+r" (__ret), "+m" (*(ptr)) \
|
||||||
|
: : "memory", "cc"); \
|
||||||
|
break; \
|
||||||
|
case __X86_CASE_Q: \
|
||||||
|
asm volatile (lock #op "q %q0, %1\n" \
|
||||||
|
: "+r" (__ret), "+m" (*(ptr)) \
|
||||||
|
: : "memory", "cc"); \
|
||||||
|
break; \
|
||||||
|
default: \
|
||||||
|
__xchg_wrong_size(); \
|
||||||
|
} \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
|
||||||
|
* Since this is generally used to protect other memory information, we
|
||||||
|
* use "asm volatile" and "memory" clobbers to prevent gcc from moving
|
||||||
|
* information around.
|
||||||
|
*/
|
||||||
|
#define xchg(ptr, v) __xchg_op((ptr), (v), xchg, "")
|
||||||
|
|
||||||
static inline unsigned long atomic_cmpxchg8(unsigned long *addr,
|
static inline unsigned long atomic_cmpxchg8(unsigned long *addr,
|
||||||
unsigned long oldval,
|
unsigned long oldval,
|
||||||
@@ -241,4 +271,66 @@ static inline unsigned long ihk_atomic_add_long_return(long i, long *v) {
|
|||||||
return i + __i;
|
return i + __i;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern void __cmpxchg_wrong_size(void)
|
||||||
|
__compiletime_error("Bad argument size for cmpxchg");
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Atomic compare and exchange. Compare OLD with MEM, if identical,
|
||||||
|
* store NEW in MEM. Return the initial value in MEM. Success is
|
||||||
|
* indicated by comparing RETURN with OLD.
|
||||||
|
*/
|
||||||
|
#define __raw_cmpxchg(ptr, old, new, size, lock) \
|
||||||
|
({ \
|
||||||
|
__typeof__(*(ptr)) __ret; \
|
||||||
|
__typeof__(*(ptr)) __old = (old); \
|
||||||
|
__typeof__(*(ptr)) __new = (new); \
|
||||||
|
switch (size) { \
|
||||||
|
case __X86_CASE_B: \
|
||||||
|
{ \
|
||||||
|
volatile uint8_t *__ptr = (volatile uint8_t *)(ptr);\
|
||||||
|
asm volatile(lock "cmpxchgb %2,%1" \
|
||||||
|
: "=a" (__ret), "+m" (*__ptr) \
|
||||||
|
: "q" (__new), "0" (__old) \
|
||||||
|
: "memory"); \
|
||||||
|
break; \
|
||||||
|
} \
|
||||||
|
case __X86_CASE_W: \
|
||||||
|
{ \
|
||||||
|
volatile uint16_t *__ptr = (volatile uint16_t *)(ptr);\
|
||||||
|
asm volatile(lock "cmpxchgw %2,%1" \
|
||||||
|
: "=a" (__ret), "+m" (*__ptr) \
|
||||||
|
: "r" (__new), "0" (__old) \
|
||||||
|
: "memory"); \
|
||||||
|
break; \
|
||||||
|
} \
|
||||||
|
case __X86_CASE_L: \
|
||||||
|
{ \
|
||||||
|
volatile uint32_t *__ptr = (volatile uint32_t *)(ptr);\
|
||||||
|
asm volatile(lock "cmpxchgl %2,%1" \
|
||||||
|
: "=a" (__ret), "+m" (*__ptr) \
|
||||||
|
: "r" (__new), "0" (__old) \
|
||||||
|
: "memory"); \
|
||||||
|
break; \
|
||||||
|
} \
|
||||||
|
case __X86_CASE_Q: \
|
||||||
|
{ \
|
||||||
|
volatile uint64_t *__ptr = (volatile uint64_t *)(ptr);\
|
||||||
|
asm volatile(lock "cmpxchgq %2,%1" \
|
||||||
|
: "=a" (__ret), "+m" (*__ptr) \
|
||||||
|
: "r" (__new), "0" (__old) \
|
||||||
|
: "memory"); \
|
||||||
|
break; \
|
||||||
|
} \
|
||||||
|
default: \
|
||||||
|
__cmpxchg_wrong_size(); \
|
||||||
|
} \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
|
||||||
|
#define __cmpxchg(ptr, old, new, size) \
|
||||||
|
__raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
|
||||||
|
|
||||||
|
#define cmpxchg(ptr, old, new) \
|
||||||
|
__cmpxchg(ptr, old, new, sizeof(*(ptr)))
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -71,7 +71,7 @@
|
|||||||
#define MSR_PERF_CTL_0 0xc0010000
|
#define MSR_PERF_CTL_0 0xc0010000
|
||||||
#define MSR_PERF_CTR_0 0xc0010004
|
#define MSR_PERF_CTR_0 0xc0010004
|
||||||
|
|
||||||
static unsigned long xgetbv(unsigned int index)
|
static inline unsigned long xgetbv(unsigned int index)
|
||||||
{
|
{
|
||||||
unsigned int low, high;
|
unsigned int low, high;
|
||||||
|
|
||||||
@@ -80,7 +80,7 @@ static unsigned long xgetbv(unsigned int index)
|
|||||||
return low | ((unsigned long)high << 32);
|
return low | ((unsigned long)high << 32);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void xsetbv(unsigned int index, unsigned long val)
|
static inline void xsetbv(unsigned int index, unsigned long val)
|
||||||
{
|
{
|
||||||
unsigned int low, high;
|
unsigned int low, high;
|
||||||
|
|
||||||
@@ -90,7 +90,8 @@ static void xsetbv(unsigned int index, unsigned long val)
|
|||||||
asm volatile("xsetbv" : : "a" (low), "d" (high), "c" (index));
|
asm volatile("xsetbv" : : "a" (low), "d" (high), "c" (index));
|
||||||
}
|
}
|
||||||
|
|
||||||
static void wrmsr(unsigned int idx, unsigned long value){
|
static inline void wrmsr(unsigned int idx, unsigned long value)
|
||||||
|
{
|
||||||
unsigned int high, low;
|
unsigned int high, low;
|
||||||
|
|
||||||
high = value >> 32;
|
high = value >> 32;
|
||||||
@@ -99,7 +100,7 @@ static void wrmsr(unsigned int idx, unsigned long value){
|
|||||||
asm volatile("wrmsr" : : "c" (idx), "a" (low), "d" (high) : "memory");
|
asm volatile("wrmsr" : : "c" (idx), "a" (low), "d" (high) : "memory");
|
||||||
}
|
}
|
||||||
|
|
||||||
static unsigned long rdpmc(unsigned int counter)
|
static inline unsigned long rdpmc(unsigned int counter)
|
||||||
{
|
{
|
||||||
unsigned int high, low;
|
unsigned int high, low;
|
||||||
|
|
||||||
@@ -108,7 +109,7 @@ static unsigned long rdpmc(unsigned int counter)
|
|||||||
return (unsigned long)high << 32 | low;
|
return (unsigned long)high << 32 | low;
|
||||||
}
|
}
|
||||||
|
|
||||||
static unsigned long rdmsr(unsigned int index)
|
static inline unsigned long rdmsr(unsigned int index)
|
||||||
{
|
{
|
||||||
unsigned int high, low;
|
unsigned int high, low;
|
||||||
|
|
||||||
@@ -117,7 +118,7 @@ static unsigned long rdmsr(unsigned int index)
|
|||||||
return (unsigned long)high << 32 | low;
|
return (unsigned long)high << 32 | low;
|
||||||
}
|
}
|
||||||
|
|
||||||
static unsigned long rdtsc(void)
|
static inline unsigned long rdtsc(void)
|
||||||
{
|
{
|
||||||
unsigned int high, low;
|
unsigned int high, low;
|
||||||
|
|
||||||
@@ -126,7 +127,7 @@ static unsigned long rdtsc(void)
|
|||||||
return (unsigned long)high << 32 | low;
|
return (unsigned long)high << 32 | low;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void set_perfctl(int counter, int event, int mask)
|
static inline void set_perfctl(int counter, int event, int mask)
|
||||||
{
|
{
|
||||||
unsigned long value;
|
unsigned long value;
|
||||||
|
|
||||||
@@ -137,7 +138,7 @@ static void set_perfctl(int counter, int event, int mask)
|
|||||||
wrmsr(MSR_PERF_CTL_0 + counter, value);
|
wrmsr(MSR_PERF_CTL_0 + counter, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void start_perfctr(int counter)
|
static inline void start_perfctr(int counter)
|
||||||
{
|
{
|
||||||
unsigned long value;
|
unsigned long value;
|
||||||
|
|
||||||
@@ -145,7 +146,7 @@ static void start_perfctr(int counter)
|
|||||||
value |= (1 << 22);
|
value |= (1 << 22);
|
||||||
wrmsr(MSR_PERF_CTL_0 + counter, value);
|
wrmsr(MSR_PERF_CTL_0 + counter, value);
|
||||||
}
|
}
|
||||||
static void stop_perfctr(int counter)
|
static inline void stop_perfctr(int counter)
|
||||||
{
|
{
|
||||||
unsigned long value;
|
unsigned long value;
|
||||||
|
|
||||||
@@ -154,17 +155,17 @@ static void stop_perfctr(int counter)
|
|||||||
wrmsr(MSR_PERF_CTL_0 + counter, value);
|
wrmsr(MSR_PERF_CTL_0 + counter, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void clear_perfctl(int counter)
|
static inline void clear_perfctl(int counter)
|
||||||
{
|
{
|
||||||
wrmsr(MSR_PERF_CTL_0 + counter, 0);
|
wrmsr(MSR_PERF_CTL_0 + counter, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void set_perfctr(int counter, unsigned long value)
|
static inline void set_perfctr(int counter, unsigned long value)
|
||||||
{
|
{
|
||||||
wrmsr(MSR_PERF_CTR_0 + counter, value);
|
wrmsr(MSR_PERF_CTR_0 + counter, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
static unsigned long read_perfctr(int counter)
|
static inline unsigned long read_perfctr(int counter)
|
||||||
{
|
{
|
||||||
return rdpmc(counter);
|
return rdpmc(counter);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -74,6 +74,7 @@ SYSCALL_DELEGATED(89, readlink)
|
|||||||
SYSCALL_HANDLED(96, gettimeofday)
|
SYSCALL_HANDLED(96, gettimeofday)
|
||||||
SYSCALL_HANDLED(97, getrlimit)
|
SYSCALL_HANDLED(97, getrlimit)
|
||||||
SYSCALL_HANDLED(98, getrusage)
|
SYSCALL_HANDLED(98, getrusage)
|
||||||
|
SYSCALL_HANDLED(99, sysinfo)
|
||||||
SYSCALL_HANDLED(100, times)
|
SYSCALL_HANDLED(100, times)
|
||||||
SYSCALL_HANDLED(101, ptrace)
|
SYSCALL_HANDLED(101, ptrace)
|
||||||
SYSCALL_HANDLED(102, getuid)
|
SYSCALL_HANDLED(102, getuid)
|
||||||
@@ -147,24 +148,23 @@ SYSCALL_DELEGATED(266, symlinkat)
|
|||||||
SYSCALL_DELEGATED(267, readlinkat)
|
SYSCALL_DELEGATED(267, readlinkat)
|
||||||
SYSCALL_DELEGATED(268, fchmodat)
|
SYSCALL_DELEGATED(268, fchmodat)
|
||||||
SYSCALL_DELEGATED(269, faccessat)
|
SYSCALL_DELEGATED(269, faccessat)
|
||||||
SYSCALL_DELEGATED(270, pselect6)
|
SYSCALL_HANDLED(270, pselect6)
|
||||||
SYSCALL_DELEGATED(271, ppoll)
|
SYSCALL_HANDLED(271, ppoll)
|
||||||
SYSCALL_HANDLED(273, set_robust_list)
|
SYSCALL_HANDLED(273, set_robust_list)
|
||||||
SYSCALL_HANDLED(279, move_pages)
|
SYSCALL_HANDLED(279, move_pages)
|
||||||
SYSCALL_DELEGATED(281, epoll_pwait)
|
SYSCALL_HANDLED(281, epoll_pwait)
|
||||||
SYSCALL_HANDLED(282, signalfd)
|
SYSCALL_HANDLED(282, signalfd)
|
||||||
SYSCALL_HANDLED(289, signalfd4)
|
SYSCALL_HANDLED(289, signalfd4)
|
||||||
|
#ifdef ENABLE_PERF
|
||||||
SYSCALL_HANDLED(298, perf_event_open)
|
SYSCALL_HANDLED(298, perf_event_open)
|
||||||
|
#endif
|
||||||
#ifdef DCFA_KMOD
|
#ifdef DCFA_KMOD
|
||||||
SYSCALL_HANDLED(303, mod_call)
|
SYSCALL_HANDLED(303, mod_call)
|
||||||
#endif
|
#endif
|
||||||
SYSCALL_HANDLED(309, getcpu)
|
SYSCALL_HANDLED(309, getcpu)
|
||||||
SYSCALL_HANDLED(310, process_vm_readv)
|
SYSCALL_HANDLED(310, process_vm_readv)
|
||||||
SYSCALL_HANDLED(311, process_vm_writev)
|
SYSCALL_HANDLED(311, process_vm_writev)
|
||||||
SYSCALL_HANDLED(601, pmc_init)
|
SYSCALL_HANDLED(322, execveat)
|
||||||
SYSCALL_HANDLED(602, pmc_start)
|
|
||||||
SYSCALL_HANDLED(603, pmc_stop)
|
|
||||||
SYSCALL_HANDLED(604, pmc_reset)
|
|
||||||
SYSCALL_HANDLED(700, get_cpu_id)
|
SYSCALL_HANDLED(700, get_cpu_id)
|
||||||
#ifdef PROFILE_ENABLE
|
#ifdef PROFILE_ENABLE
|
||||||
SYSCALL_HANDLED(__NR_profile, profile)
|
SYSCALL_HANDLED(__NR_profile, profile)
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
/* interrupt.S COPYRIGHT FUJITSU LIMITED 2019 */
|
||||||
/**
|
/**
|
||||||
* \file interrupt.S
|
* \file interrupt.S
|
||||||
* License details are found in the file LICENSE.
|
* License details are found in the file LICENSE.
|
||||||
@@ -91,6 +92,9 @@ vector=vector+1
|
|||||||
.endr
|
.endr
|
||||||
|
|
||||||
common_interrupt:
|
common_interrupt:
|
||||||
|
#define MULT_INTR_VECTOR 242
|
||||||
|
cmp $(MULT_INTR_VECTOR),%rdi
|
||||||
|
je 1f
|
||||||
PUSH_ALL_REGS
|
PUSH_ALL_REGS
|
||||||
movq ERROR_OFFSET(%rsp), %rdi
|
movq ERROR_OFFSET(%rsp), %rdi
|
||||||
movq %rsp, %rsi
|
movq %rsp, %rsi
|
||||||
@@ -99,6 +103,19 @@ common_interrupt:
|
|||||||
addq $8, %rsp
|
addq $8, %rsp
|
||||||
iretq
|
iretq
|
||||||
|
|
||||||
|
|
||||||
|
.globl nmi_handler
|
||||||
|
nmi_handler:
|
||||||
|
cld
|
||||||
|
pushq $0 /* error field of x86_basic_regs */
|
||||||
|
PUSH_ALL_REGS
|
||||||
|
movq %rsp, %rdi
|
||||||
|
call multi_nm_interrupt_handler /* Enter C code */
|
||||||
|
POP_ALL_REGS
|
||||||
|
addq $8, %rsp
|
||||||
|
iretq
|
||||||
|
|
||||||
|
|
||||||
.globl __page_fault_handler_address
|
.globl __page_fault_handler_address
|
||||||
__page_fault_handler_address:
|
__page_fault_handler_address:
|
||||||
.quad 0
|
.quad 0
|
||||||
@@ -137,74 +154,6 @@ __freeze:
|
|||||||
POP_ALL_REGS
|
POP_ALL_REGS
|
||||||
iretq
|
iretq
|
||||||
|
|
||||||
.globl nmi
|
|
||||||
nmi:
|
|
||||||
#define PANICED 232
|
|
||||||
#define PANIC_REGS 240
|
|
||||||
movq %rax,%gs:PANIC_REGS+0x00
|
|
||||||
movq %rsp,%gs:PANIC_REGS+0x08
|
|
||||||
|
|
||||||
movl nmi_mode(%rip),%eax
|
|
||||||
cmp $3,%rax
|
|
||||||
je 4f
|
|
||||||
cmp $1,%rax
|
|
||||||
je 1f
|
|
||||||
cmp $2,%rax
|
|
||||||
jne 3f
|
|
||||||
1:
|
|
||||||
cld
|
|
||||||
movq %gs:PANIC_REGS+0x00,%rax
|
|
||||||
PUSH_ALL_REGS
|
|
||||||
subq $40, %rsp
|
|
||||||
movq %rsp,%gs:PANIC_REGS+0x10
|
|
||||||
movq %rsp, %rdi
|
|
||||||
call freeze_thaw
|
|
||||||
cmpq $0, %rax
|
|
||||||
jnz 2f
|
|
||||||
addq $40, %rsp
|
|
||||||
2:
|
|
||||||
POP_ALL_REGS
|
|
||||||
iretq
|
|
||||||
3:
|
|
||||||
movq %rbx,%gs:PANIC_REGS+0x08
|
|
||||||
movq %rcx,%gs:PANIC_REGS+0x10
|
|
||||||
movq %rdx,%gs:PANIC_REGS+0x18
|
|
||||||
movq %rsi,%gs:PANIC_REGS+0x20
|
|
||||||
movq %rdi,%gs:PANIC_REGS+0x28
|
|
||||||
movq %rbp,%gs:PANIC_REGS+0x30
|
|
||||||
movq 0x18(%rsp),%rax /* rsp */
|
|
||||||
movq %rax,%gs:PANIC_REGS+0x38
|
|
||||||
movq %r8, %gs:PANIC_REGS+0x40
|
|
||||||
movq %r9, %gs:PANIC_REGS+0x48
|
|
||||||
movq %r10,%gs:PANIC_REGS+0x50
|
|
||||||
movq %r11,%gs:PANIC_REGS+0x58
|
|
||||||
movq %r12,%gs:PANIC_REGS+0x60
|
|
||||||
movq %r13,%gs:PANIC_REGS+0x68
|
|
||||||
movq %r14,%gs:PANIC_REGS+0x70
|
|
||||||
movq %r15,%gs:PANIC_REGS+0x78
|
|
||||||
movq 0x00(%rsp),%rax /* rip */
|
|
||||||
movq %rax,%gs:PANIC_REGS+0x80
|
|
||||||
movq 0x10(%rsp),%rax /* rflags */
|
|
||||||
movl %eax,%gs:PANIC_REGS+0x88
|
|
||||||
movq 0x08(%rsp),%rax /* cs */
|
|
||||||
movl %eax,%gs:PANIC_REGS+0x8C
|
|
||||||
movq 0x20(%rsp),%rax /* ss */
|
|
||||||
movl %eax,%gs:PANIC_REGS+0x90
|
|
||||||
xorq %rax,%rax
|
|
||||||
movw %ds,%ax
|
|
||||||
movl %eax,%gs:PANIC_REGS+0x94
|
|
||||||
movw %es,%ax
|
|
||||||
movl %eax,%gs:PANIC_REGS+0x98
|
|
||||||
movw %fs,%ax
|
|
||||||
movl %eax,%gs:PANIC_REGS+0x9C
|
|
||||||
movw %gs,%ax
|
|
||||||
movl %eax,%gs:PANIC_REGS+0xA0
|
|
||||||
movq $1,%gs:PANICED
|
|
||||||
call ihk_mc_query_mem_areas
|
|
||||||
4:
|
|
||||||
hlt
|
|
||||||
jmp 4b
|
|
||||||
|
|
||||||
.globl x86_syscall
|
.globl x86_syscall
|
||||||
x86_syscall:
|
x86_syscall:
|
||||||
cld
|
cld
|
||||||
|
|||||||
@@ -14,7 +14,6 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include <ihk/cpu.h>
|
#include <ihk/cpu.h>
|
||||||
#include <ihk/debug.h>
|
|
||||||
#include <ihk/mm.h>
|
#include <ihk/mm.h>
|
||||||
#include <types.h>
|
#include <types.h>
|
||||||
#include <memory.h>
|
#include <memory.h>
|
||||||
@@ -26,7 +25,7 @@
|
|||||||
#include <cls.h>
|
#include <cls.h>
|
||||||
#include <kmalloc.h>
|
#include <kmalloc.h>
|
||||||
#include <rusage_private.h>
|
#include <rusage_private.h>
|
||||||
#include <debug.h>
|
#include <ihk/debug.h>
|
||||||
|
|
||||||
//#define DEBUG
|
//#define DEBUG
|
||||||
|
|
||||||
@@ -38,6 +37,7 @@
|
|||||||
static char *last_page;
|
static char *last_page;
|
||||||
extern char _head[], _end[];
|
extern char _head[], _end[];
|
||||||
|
|
||||||
|
extern unsigned long linux_page_offset_base;
|
||||||
extern unsigned long x86_kernel_phys_base;
|
extern unsigned long x86_kernel_phys_base;
|
||||||
|
|
||||||
/* Arch specific early allocation routine */
|
/* Arch specific early allocation routine */
|
||||||
@@ -1355,109 +1355,6 @@ struct clear_range_args {
|
|||||||
int max_nr_addr;
|
int max_nr_addr;
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_8
|
|
||||||
void remote_flush_tlb_cpumask(struct process_vm *vm,
|
|
||||||
unsigned long addr, int cpu_id)
|
|
||||||
{
|
|
||||||
unsigned long __addr = addr;
|
|
||||||
return remote_flush_tlb_array_cpumask(vm, &__addr, 1, cpu_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
void remote_flush_tlb_array_cpumask(struct process_vm *vm,
|
|
||||||
unsigned long *addr,
|
|
||||||
int nr_addr,
|
|
||||||
int cpu_id)
|
|
||||||
{
|
|
||||||
unsigned long cpu;
|
|
||||||
int flush_ind;
|
|
||||||
struct tlb_flush_entry *flush_entry;
|
|
||||||
cpu_set_t _cpu_set;
|
|
||||||
|
|
||||||
if (addr[0]) {
|
|
||||||
flush_ind = (addr[0] >> PAGE_SHIFT) % IHK_TLB_FLUSH_IRQ_VECTOR_SIZE;
|
|
||||||
}
|
|
||||||
/* Zero address denotes full TLB flush */
|
|
||||||
else {
|
|
||||||
/* Random.. */
|
|
||||||
flush_ind = (rdtsc()) % IHK_TLB_FLUSH_IRQ_VECTOR_SIZE;
|
|
||||||
}
|
|
||||||
|
|
||||||
flush_entry = &tlb_flush_vector[flush_ind];
|
|
||||||
|
|
||||||
/* Take a copy of the cpu set so that we don't hold the lock
|
|
||||||
* all the way while interrupting other cores */
|
|
||||||
ihk_mc_spinlock_lock_noirq(&vm->address_space->cpu_set_lock);
|
|
||||||
memcpy(&_cpu_set, &vm->address_space->cpu_set, sizeof(cpu_set_t));
|
|
||||||
ihk_mc_spinlock_unlock_noirq(&vm->address_space->cpu_set_lock);
|
|
||||||
|
|
||||||
dkprintf("trying to aquire flush_entry->lock flush_ind: %d\n", flush_ind);
|
|
||||||
|
|
||||||
ihk_mc_spinlock_lock_noirq(&flush_entry->lock);
|
|
||||||
|
|
||||||
flush_entry->vm = vm;
|
|
||||||
flush_entry->addr = addr;
|
|
||||||
flush_entry->nr_addr = nr_addr;
|
|
||||||
ihk_atomic_set(&flush_entry->pending, 0);
|
|
||||||
|
|
||||||
dkprintf("lock aquired, iterating cpu mask.. flush_ind: %d\n", flush_ind);
|
|
||||||
|
|
||||||
/* Loop through CPUs in this address space and interrupt them for
|
|
||||||
* TLB flush on the specified address */
|
|
||||||
for_each_set_bit(cpu, (const unsigned long*)&_cpu_set.__bits, CPU_SETSIZE) {
|
|
||||||
|
|
||||||
if (ihk_mc_get_processor_id() == cpu)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
ihk_atomic_inc(&flush_entry->pending);
|
|
||||||
dkprintf("remote_flush_tlb_cpumask: flush_ind: %d, addr: 0x%lX, interrupting cpu: %d\n",
|
|
||||||
flush_ind, addr, cpu);
|
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_8 /* arch depend hide */
|
|
||||||
/* TODO(pka_idke) Interim support */
|
|
||||||
ihk_mc_interrupt_cpu(cpu,
|
|
||||||
ihk_mc_get_vector(flush_ind + IHK_TLB_FLUSH_IRQ_VECTOR_START));
|
|
||||||
#else /* POSTK_DEBUG_ARCH_DEP_8 */
|
|
||||||
ihk_mc_interrupt_cpu(get_x86_cpu_local_variable(cpu)->apic_id,
|
|
||||||
flush_ind + IHK_TLB_FLUSH_IRQ_VECTOR_START);
|
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_8 */
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef DEBUG_IC_TLB
|
|
||||||
{
|
|
||||||
unsigned long tsc;
|
|
||||||
tsc = rdtsc() + 12884901888; /* 1.2GHz =>10 sec */
|
|
||||||
#endif
|
|
||||||
if (flush_entry->addr[0]) {
|
|
||||||
int i;
|
|
||||||
|
|
||||||
for (i = 0; i < flush_entry->nr_addr; ++i) {
|
|
||||||
flush_tlb_single(flush_entry->addr[i] & PAGE_MASK);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/* Zero address denotes full TLB flush */
|
|
||||||
else {
|
|
||||||
flush_tlb();
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Wait for all cores */
|
|
||||||
while (ihk_atomic_read(&flush_entry->pending) != 0) {
|
|
||||||
cpu_pause();
|
|
||||||
|
|
||||||
#ifdef DEBUG_IC_TLB
|
|
||||||
if (rdtsc() > tsc) {
|
|
||||||
kprintf("waited 10 secs for remote TLB!! -> panic_all()\n");
|
|
||||||
panic_all_cores("waited 10 secs for remote TLB!!\n");
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
#ifdef DEBUG_IC_TLB
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ihk_mc_spinlock_unlock_noirq(&flush_entry->lock);
|
|
||||||
}
|
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_8 */
|
|
||||||
|
|
||||||
static void remote_flush_tlb_add_addr(struct clear_range_args *args,
|
static void remote_flush_tlb_add_addr(struct clear_range_args *args,
|
||||||
unsigned long addr)
|
unsigned long addr)
|
||||||
{
|
{
|
||||||
@@ -1622,7 +1519,7 @@ static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base,
|
|||||||
{
|
{
|
||||||
struct clear_range_args *args = args0;
|
struct clear_range_args *args = args0;
|
||||||
int error;
|
int error;
|
||||||
uint64_t phys;
|
uint64_t phys = 0;
|
||||||
pte_t old;
|
pte_t old;
|
||||||
struct page *page;
|
struct page *page;
|
||||||
struct page_table *pt;
|
struct page_table *pt;
|
||||||
@@ -2572,10 +2469,10 @@ static void init_linux_kernel_mapping(struct page_table *pt)
|
|||||||
map_start = 0;
|
map_start = 0;
|
||||||
map_end = 0x20000000000;
|
map_end = 0x20000000000;
|
||||||
|
|
||||||
virt = (void *)LINUX_PAGE_OFFSET;
|
virt = (void *)linux_page_offset_base;
|
||||||
|
|
||||||
kprintf("Linux kernel virtual: 0x%lx - 0x%lx -> 0x%lx - 0x%lx\n",
|
kprintf("Linux kernel virtual: 0x%lx - 0x%lx -> 0x%lx - 0x%lx\n",
|
||||||
LINUX_PAGE_OFFSET, LINUX_PAGE_OFFSET + map_end, 0, map_end);
|
virt, virt + map_end, 0, map_end);
|
||||||
|
|
||||||
for (phys = map_start; phys < map_end; phys += LARGE_PAGE_SIZE) {
|
for (phys = map_start; phys < map_end; phys += LARGE_PAGE_SIZE) {
|
||||||
if (set_pt_large_page(pt, virt, phys, PTATTR_WRITABLE) != 0) {
|
if (set_pt_large_page(pt, virt, phys, PTATTR_WRITABLE) != 0) {
|
||||||
@@ -2599,9 +2496,11 @@ static void init_linux_kernel_mapping(struct page_table *pt)
|
|||||||
}
|
}
|
||||||
|
|
||||||
dkprintf("Linux kernel virtual: 0x%lx - 0x%lx -> 0x%lx - 0x%lx\n",
|
dkprintf("Linux kernel virtual: 0x%lx - 0x%lx -> 0x%lx - 0x%lx\n",
|
||||||
LINUX_PAGE_OFFSET + map_start, LINUX_PAGE_OFFSET + map_end, map_start, map_end);
|
linux_page_offset_base + map_start,
|
||||||
|
linux_page_offset_base + map_end,
|
||||||
|
map_start, map_end);
|
||||||
|
|
||||||
virt = (void *)(LINUX_PAGE_OFFSET + map_start);
|
virt = (void *)(linux_page_offset_base + map_start);
|
||||||
for (phys = map_start; phys < map_end; phys += LARGE_PAGE_SIZE, virt += LARGE_PAGE_SIZE) {
|
for (phys = map_start; phys < map_end; phys += LARGE_PAGE_SIZE, virt += LARGE_PAGE_SIZE) {
|
||||||
if (set_pt_large_page(pt, virt, phys, PTATTR_WRITABLE) != 0) {
|
if (set_pt_large_page(pt, virt, phys, PTATTR_WRITABLE) != 0) {
|
||||||
kprintf("%s: set_pt_large_page() failed for 0x%lx\n", __FUNCTION__, virt);
|
kprintf("%s: set_pt_large_page() failed for 0x%lx\n", __FUNCTION__, virt);
|
||||||
@@ -2652,7 +2551,7 @@ void *map_fixed_area(unsigned long phys, unsigned long size, int uncachable)
|
|||||||
attr |= PTATTR_UNCACHABLE;
|
attr |= PTATTR_UNCACHABLE;
|
||||||
}
|
}
|
||||||
|
|
||||||
kprintf("map_fixed: phys: 0x%lx => 0x%lx (%d pages)\n",
|
dkprintf("map_fixed: phys: 0x%lx => 0x%lx (%d pages)\n",
|
||||||
paligned, v, npages);
|
paligned, v, npages);
|
||||||
|
|
||||||
for (i = 0; i < npages; i++) {
|
for (i = 0; i < npages; i++) {
|
||||||
@@ -2745,12 +2644,12 @@ unsigned long virt_to_phys(void *v)
|
|||||||
unsigned long va = (unsigned long)v;
|
unsigned long va = (unsigned long)v;
|
||||||
|
|
||||||
if (va >= MAP_KERNEL_START) {
|
if (va >= MAP_KERNEL_START) {
|
||||||
dkprintf("%s: MAP_KERNEL_START <= 0x%lx <= LINUX_PAGE_OFFSET\n",
|
dkprintf("%s: MAP_KERNEL_START <= 0x%lx <= linux_page_offset_base\n",
|
||||||
__FUNCTION__, va);
|
__FUNCTION__, va);
|
||||||
return va - MAP_KERNEL_START + x86_kernel_phys_base;
|
return va - MAP_KERNEL_START + x86_kernel_phys_base;
|
||||||
}
|
}
|
||||||
else if (va >= LINUX_PAGE_OFFSET) {
|
else if (va >= linux_page_offset_base) {
|
||||||
return va - LINUX_PAGE_OFFSET;
|
return va - linux_page_offset_base;
|
||||||
}
|
}
|
||||||
else if (va >= MAP_FIXED_START) {
|
else if (va >= MAP_FIXED_START) {
|
||||||
return va - MAP_FIXED_START;
|
return va - MAP_FIXED_START;
|
||||||
@@ -2769,7 +2668,7 @@ void *phys_to_virt(unsigned long p)
|
|||||||
return (void *)(p + MAP_ST_START);
|
return (void *)(p + MAP_ST_START);
|
||||||
}
|
}
|
||||||
|
|
||||||
return (void *)(p + LINUX_PAGE_OFFSET);
|
return (void *)(p + linux_page_offset_base);
|
||||||
}
|
}
|
||||||
|
|
||||||
int copy_from_user(void *dst, const void *src, size_t siz)
|
int copy_from_user(void *dst, const void *src, size_t siz)
|
||||||
|
|||||||
@@ -12,12 +12,12 @@
|
|||||||
#include <march.h>
|
#include <march.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <cls.h>
|
#include <cls.h>
|
||||||
#include <ihk/debug.h>
|
|
||||||
#include <ihk/cpu.h>
|
#include <ihk/cpu.h>
|
||||||
#include <registers.h>
|
#include <registers.h>
|
||||||
#include <mc_perf_event.h>
|
#include <mc_perf_event.h>
|
||||||
#include <config.h>
|
#include <config.h>
|
||||||
#include <debug.h>
|
#include <ihk/debug.h>
|
||||||
|
#include <process.h>
|
||||||
|
|
||||||
extern unsigned int *x86_march_perfmap;
|
extern unsigned int *x86_march_perfmap;
|
||||||
extern int running_on_kvm(void);
|
extern int running_on_kvm(void);
|
||||||
@@ -223,41 +223,6 @@ int ihk_mc_perfctr_init_raw(int counter, unsigned int code, int mode)
|
|||||||
#endif /*POSTK_DEBUG_TEMP_FIX_29*/
|
#endif /*POSTK_DEBUG_TEMP_FIX_29*/
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_TEMP_FIX_29
|
|
||||||
int ihk_mc_perfctr_init(int counter, uint64_t config, int mode)
|
|
||||||
#else
|
|
||||||
int ihk_mc_perfctr_init(int counter, enum ihk_perfctr_type type, int mode)
|
|
||||||
#endif /*POSTK_DEBUG_TEMP_FIX_29*/
|
|
||||||
{
|
|
||||||
#ifdef POSTK_DEBUG_TEMP_FIX_29
|
|
||||||
enum ihk_perfctr_type type;
|
|
||||||
|
|
||||||
switch (config) {
|
|
||||||
case PERF_COUNT_HW_CPU_CYCLES :
|
|
||||||
type = APT_TYPE_CYCLE;
|
|
||||||
break;
|
|
||||||
case PERF_COUNT_HW_INSTRUCTIONS :
|
|
||||||
type = APT_TYPE_INSTRUCTIONS;
|
|
||||||
break;
|
|
||||||
default :
|
|
||||||
// Not supported config.
|
|
||||||
type = PERFCTR_MAX_TYPE;
|
|
||||||
}
|
|
||||||
#endif /*POSTK_DEBUG_TEMP_FIX_29*/
|
|
||||||
|
|
||||||
if (counter < 0 || counter >= NUM_PERF_COUNTERS) {
|
|
||||||
return -EINVAL;
|
|
||||||
}
|
|
||||||
if (type < 0 || type >= PERFCTR_MAX_TYPE) {
|
|
||||||
return -EINVAL;
|
|
||||||
}
|
|
||||||
if (!x86_march_perfmap[type]) {
|
|
||||||
return -EINVAL;
|
|
||||||
}
|
|
||||||
|
|
||||||
return set_perfctr_x86_direct(counter, mode, x86_march_perfmap[type]);
|
|
||||||
}
|
|
||||||
|
|
||||||
int ihk_mc_perfctr_set_extra(struct mc_perf_event *event)
|
int ihk_mc_perfctr_set_extra(struct mc_perf_event *event)
|
||||||
{
|
{
|
||||||
struct thread *thread = cpu_local_var(current);
|
struct thread *thread = cpu_local_var(current);
|
||||||
@@ -412,6 +377,23 @@ int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ihk_mc_perfctr_alloc(struct thread *thread, struct mc_perf_event *event)
|
||||||
|
{
|
||||||
|
int ret = -EINVAL;
|
||||||
|
int i = 0;
|
||||||
|
const int counters = ihk_mc_perf_get_num_counters();
|
||||||
|
|
||||||
|
// find avail generic counter
|
||||||
|
for (i = 0; i < counters; i++) {
|
||||||
|
if (!(thread->pmc_alloc_map & (1 << i))) {
|
||||||
|
ret = i;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
unsigned long ihk_mc_perfctr_read(int counter)
|
unsigned long ihk_mc_perfctr_read(int counter)
|
||||||
{
|
{
|
||||||
unsigned long retval = 0;
|
unsigned long retval = 0;
|
||||||
@@ -439,6 +421,14 @@ unsigned long ihk_mc_perfctr_read(int counter)
|
|||||||
return retval;
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsigned long ihk_mc_perfctr_value(int counter, unsigned long correction)
|
||||||
|
{
|
||||||
|
unsigned long count = ihk_mc_perfctr_read(counter) + correction;
|
||||||
|
|
||||||
|
count &= 0x000000ffffffffffL;
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
// read by rdmsr
|
// read by rdmsr
|
||||||
unsigned long ihk_mc_perfctr_read_msr(int counter)
|
unsigned long ihk_mc_perfctr_read_msr(int counter)
|
||||||
{
|
{
|
||||||
@@ -513,3 +503,18 @@ int ihk_mc_perf_get_num_counters(void)
|
|||||||
{
|
{
|
||||||
return NUM_PERF_COUNTERS;
|
return NUM_PERF_COUNTERS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int hw_perf_event_init(struct mc_perf_event *event)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int ihk_mc_event_set_period(struct mc_perf_event *event)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t ihk_mc_event_update(struct mc_perf_event *event)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|||||||
@@ -16,7 +16,6 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include <ihk/cpu.h>
|
#include <ihk/cpu.h>
|
||||||
#include <ihk/debug.h>
|
|
||||||
#include <cls.h>
|
#include <cls.h>
|
||||||
#include <cpulocal.h>
|
#include <cpulocal.h>
|
||||||
#include <syscall.h>
|
#include <syscall.h>
|
||||||
@@ -32,7 +31,9 @@
|
|||||||
#include <page.h>
|
#include <page.h>
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#include <syscall.h>
|
#include <syscall.h>
|
||||||
#include <debug.h>
|
#include <bitops.h>
|
||||||
|
#include <rusage_private.h>
|
||||||
|
#include <ihk/debug.h>
|
||||||
|
|
||||||
void terminate_mcexec(int, int);
|
void terminate_mcexec(int, int);
|
||||||
extern long do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact);
|
extern long do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact);
|
||||||
@@ -64,7 +65,6 @@ uintptr_t debug_constants[] = {
|
|||||||
-1,
|
-1,
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_52
|
|
||||||
#define VDSO_MAXPAGES 2
|
#define VDSO_MAXPAGES 2
|
||||||
struct vdso {
|
struct vdso {
|
||||||
long busy;
|
long busy;
|
||||||
@@ -80,8 +80,24 @@ struct vdso {
|
|||||||
long hpet_phys;
|
long hpet_phys;
|
||||||
void *pvti_virt;
|
void *pvti_virt;
|
||||||
long pvti_phys;
|
long pvti_phys;
|
||||||
|
void *vgtod_virt;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct vsyscall_gtod_data {
|
||||||
|
int seq;
|
||||||
|
|
||||||
|
struct {
|
||||||
|
int vclock_mode;
|
||||||
|
unsigned long cycle_last;
|
||||||
|
unsigned long mask;
|
||||||
|
unsigned int mult;
|
||||||
|
unsigned int shift;
|
||||||
|
} clock;
|
||||||
|
|
||||||
|
/* open coded 'struct timespec' */
|
||||||
|
time_t wall_time_sec;
|
||||||
|
unsigned long wall_time_snsec;
|
||||||
};
|
};
|
||||||
#endif /*POSTK_DEBUG_ARCH_DEP_52*/
|
|
||||||
|
|
||||||
static struct vdso vdso;
|
static struct vdso vdso;
|
||||||
static size_t container_size = 0;
|
static size_t container_size = 0;
|
||||||
@@ -139,7 +155,7 @@ arch_clear_host_user_space()
|
|||||||
|
|
||||||
/* XXX: might be unnecessary */
|
/* XXX: might be unnecessary */
|
||||||
clear_host_pte(th->vm->region.user_start,
|
clear_host_pte(th->vm->region.user_start,
|
||||||
(th->vm->region.user_end - th->vm->region.user_start));
|
(th->vm->region.user_end - th->vm->region.user_start), 0);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -558,7 +574,7 @@ long ptrace_write_regset(struct thread *thread, long type, struct iovec *iov)
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
extern void coredump(struct thread *thread, void *regs);
|
extern int coredump(struct thread *thread, void *regs, int sig);
|
||||||
|
|
||||||
void ptrace_report_signal(struct thread *thread, int sig)
|
void ptrace_report_signal(struct thread *thread, int sig)
|
||||||
{
|
{
|
||||||
@@ -726,6 +742,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
|||||||
struct mcs_rwlock_node_irqsave lock;
|
struct mcs_rwlock_node_irqsave lock;
|
||||||
struct mcs_rwlock_node_irqsave mcs_rw_node;
|
struct mcs_rwlock_node_irqsave mcs_rw_node;
|
||||||
int restart = 0;
|
int restart = 0;
|
||||||
|
int ret;
|
||||||
|
|
||||||
for(w = pending->sigmask.__val[0], sig = 0; w; sig++, w >>= 1);
|
for(w = pending->sigmask.__val[0], sig = 0; w; sig++, w >>= 1);
|
||||||
dkprintf("do_signal(): tid=%d, pid=%d, sig=%d\n", thread->tid, proc->pid, sig);
|
dkprintf("do_signal(): tid=%d, pid=%d, sig=%d\n", thread->tid, proc->pid, sig);
|
||||||
@@ -971,15 +988,6 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
|||||||
dkprintf("SIGTRAP(): woken up\n");
|
dkprintf("SIGTRAP(): woken up\n");
|
||||||
break;
|
break;
|
||||||
case SIGCONT:
|
case SIGCONT:
|
||||||
memset(&info, '\0', sizeof info);
|
|
||||||
info.si_signo = SIGCHLD;
|
|
||||||
info.si_code = CLD_CONTINUED;
|
|
||||||
info._sifields._sigchld.si_pid = proc->pid;
|
|
||||||
info._sifields._sigchld.si_status = 0x0000ffff;
|
|
||||||
do_kill(cpu_local_var(current), proc->parent->pid, -1, SIGCHLD, &info, 0);
|
|
||||||
proc->main_thread->signal_flags = SIGNAL_STOP_CONTINUED;
|
|
||||||
proc->status = PS_RUNNING;
|
|
||||||
dkprintf("do_signal,SIGCONT,do nothing\n");
|
|
||||||
break;
|
break;
|
||||||
case SIGQUIT:
|
case SIGQUIT:
|
||||||
case SIGILL:
|
case SIGILL:
|
||||||
@@ -991,9 +999,31 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
|||||||
case SIGXCPU:
|
case SIGXCPU:
|
||||||
case SIGXFSZ:
|
case SIGXFSZ:
|
||||||
core:
|
core:
|
||||||
dkprintf("do_signal,default,core,sig=%d\n", sig);
|
thread->coredump_regs =
|
||||||
coredump(thread, regs);
|
kmalloc(sizeof(struct x86_user_context),
|
||||||
coredumped = 0x80;
|
IHK_MC_AP_NOWAIT);
|
||||||
|
if (!thread->coredump_regs) {
|
||||||
|
kprintf("%s: Out of memory\n", __func__);
|
||||||
|
goto skip;
|
||||||
|
}
|
||||||
|
memcpy(thread->coredump_regs, regs,
|
||||||
|
sizeof(struct x86_user_context));
|
||||||
|
|
||||||
|
ret = coredump(thread, regs, sig);
|
||||||
|
switch (ret) {
|
||||||
|
case -EBUSY:
|
||||||
|
kprintf("%s: INFO: coredump not performed, try ulimit -c <non-zero>\n",
|
||||||
|
__func__);
|
||||||
|
break;
|
||||||
|
case 0:
|
||||||
|
coredumped = 0x80;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
kprintf("%s: ERROR: coredump failed (%d)\n",
|
||||||
|
__func__, ret);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
skip:
|
||||||
terminate(0, sig | coredumped);
|
terminate(0, sig | coredumped);
|
||||||
break;
|
break;
|
||||||
case SIGCHLD:
|
case SIGCHLD:
|
||||||
@@ -1038,7 +1068,9 @@ getsigpending(struct thread *thread, int delflag){
|
|||||||
for(x = pending->sigmask.__val[0], sig = 0; x; sig++, x >>= 1);
|
for(x = pending->sigmask.__val[0], sig = 0; x; sig++, x >>= 1);
|
||||||
k = thread->sigcommon->action + sig - 1;
|
k = thread->sigcommon->action + sig - 1;
|
||||||
if(delflag ||
|
if(delflag ||
|
||||||
(sig != SIGCHLD && sig != SIGURG) ||
|
(sig != SIGCHLD &&
|
||||||
|
sig != SIGURG &&
|
||||||
|
sig != SIGCONT) ||
|
||||||
(k->sa.sa_handler != (void *)1 &&
|
(k->sa.sa_handler != (void *)1 &&
|
||||||
k->sa.sa_handler != NULL)){
|
k->sa.sa_handler != NULL)){
|
||||||
if(!(pending->sigmask.__val[0] & w)){
|
if(!(pending->sigmask.__val[0] & w)){
|
||||||
@@ -1119,7 +1151,8 @@ check_signal(unsigned long rc, void *regs0, int num)
|
|||||||
if(thread == NULL || thread == &cpu_local_var(idle)){
|
if(thread == NULL || thread == &cpu_local_var(idle)){
|
||||||
struct thread *t;
|
struct thread *t;
|
||||||
|
|
||||||
irqstate = ihk_mc_spinlock_lock(&(cpu_local_var(runq_lock)));
|
irqstate = cpu_disable_interrupt_save();
|
||||||
|
ihk_mc_spinlock_lock_noirq(&(cpu_local_var(runq_lock)));
|
||||||
list_for_each_entry(t, &(cpu_local_var(runq)), sched_list){
|
list_for_each_entry(t, &(cpu_local_var(runq)), sched_list){
|
||||||
if(t == &cpu_local_var(idle))
|
if(t == &cpu_local_var(idle))
|
||||||
continue;
|
continue;
|
||||||
@@ -1129,7 +1162,8 @@ check_signal(unsigned long rc, void *regs0, int num)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ihk_mc_spinlock_unlock(&(cpu_local_var(runq_lock)), irqstate);
|
ihk_mc_spinlock_unlock_noirq(&(cpu_local_var(runq_lock)));
|
||||||
|
cpu_restore_interrupt(irqstate);
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1185,7 +1219,9 @@ check_sig_pending_thread(struct thread *thread)
|
|||||||
for (x = pending->sigmask.__val[0], sig = 0; x;
|
for (x = pending->sigmask.__val[0], sig = 0; x;
|
||||||
sig++, x >>= 1);
|
sig++, x >>= 1);
|
||||||
k = thread->sigcommon->action + sig - 1;
|
k = thread->sigcommon->action + sig - 1;
|
||||||
if ((sig != SIGCHLD && sig != SIGURG) ||
|
if ((sig != SIGCHLD &&
|
||||||
|
sig != SIGURG &&
|
||||||
|
sig != SIGCONT) ||
|
||||||
(k->sa.sa_handler != (void *)1 &&
|
(k->sa.sa_handler != (void *)1 &&
|
||||||
k->sa.sa_handler != NULL)) {
|
k->sa.sa_handler != NULL)) {
|
||||||
if (!(pending->sigmask.__val[0] & w)) {
|
if (!(pending->sigmask.__val[0] & w)) {
|
||||||
@@ -1194,6 +1230,7 @@ check_sig_pending_thread(struct thread *thread)
|
|||||||
found = 1;
|
found = 1;
|
||||||
if (sig != SIGCHLD &&
|
if (sig != SIGCHLD &&
|
||||||
sig != SIGURG &&
|
sig != SIGURG &&
|
||||||
|
sig != SIGCONT &&
|
||||||
!k->sa.sa_handler) {
|
!k->sa.sa_handler) {
|
||||||
found = 2;
|
found = 2;
|
||||||
break;
|
break;
|
||||||
@@ -1278,7 +1315,6 @@ do_kill(struct thread *thread, int pid, int tid, int sig, siginfo_t *info,
|
|||||||
struct list_head *head = NULL;
|
struct list_head *head = NULL;
|
||||||
int rc;
|
int rc;
|
||||||
unsigned long irqstate = 0;
|
unsigned long irqstate = 0;
|
||||||
struct k_sigaction *k;
|
|
||||||
int doint;
|
int doint;
|
||||||
int found = 0;
|
int found = 0;
|
||||||
siginfo_t info0;
|
siginfo_t info0;
|
||||||
@@ -1288,6 +1324,7 @@ do_kill(struct thread *thread, int pid, int tid, int sig, siginfo_t *info,
|
|||||||
struct process_hash *phash = rset->process_hash;
|
struct process_hash *phash = rset->process_hash;
|
||||||
struct mcs_rwlock_node lock;
|
struct mcs_rwlock_node lock;
|
||||||
struct mcs_rwlock_node updatelock;
|
struct mcs_rwlock_node updatelock;
|
||||||
|
struct sig_pending *pending = NULL;
|
||||||
|
|
||||||
if(sig > 64 || sig < 0)
|
if(sig > 64 || sig < 0)
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
@@ -1509,54 +1546,70 @@ done:
|
|||||||
|
|
||||||
mcs_rwlock_writer_lock_noirq(savelock, &mcs_rw_node);
|
mcs_rwlock_writer_lock_noirq(savelock, &mcs_rw_node);
|
||||||
|
|
||||||
/* Put signal event even when handler is SIG_IGN or SIG_DFL
|
|
||||||
because target ptraced thread must call ptrace_report_signal
|
|
||||||
in check_signal */
|
|
||||||
rc = 0;
|
rc = 0;
|
||||||
k = tthread->sigcommon->action + sig - 1;
|
|
||||||
if ((sig != SIGKILL && (tthread->ptrace & PT_TRACED)) ||
|
if (sig < 33) { // SIGRTMIN - SIGRTMAX
|
||||||
(k->sa.sa_handler != (void *)1 &&
|
list_for_each_entry(pending, head, list) {
|
||||||
(k->sa.sa_handler != NULL ||
|
if (pending->sigmask.__val[0] == mask &&
|
||||||
(sig != SIGCHLD && sig != SIGURG)))) {
|
pending->ptracecont == ptracecont)
|
||||||
struct sig_pending *pending = NULL;
|
break;
|
||||||
if (sig < 33) { // SIGRTMIN - SIGRTMAX
|
|
||||||
list_for_each_entry(pending, head, list){
|
|
||||||
if(pending->sigmask.__val[0] == mask &&
|
|
||||||
pending->ptracecont == ptracecont)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if(&pending->list == head)
|
|
||||||
pending = NULL;
|
|
||||||
}
|
}
|
||||||
if(pending == NULL){
|
if (&pending->list == head)
|
||||||
doint = 1;
|
pending = NULL;
|
||||||
pending = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT);
|
}
|
||||||
if(!pending){
|
if (pending == NULL) {
|
||||||
rc = -ENOMEM;
|
doint = 1;
|
||||||
}
|
pending = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT);
|
||||||
else{
|
if (!pending) {
|
||||||
memset(pending, 0, sizeof(struct sig_pending));
|
rc = -ENOMEM;
|
||||||
pending->sigmask.__val[0] = mask;
|
}
|
||||||
memcpy(&pending->info, info, sizeof(siginfo_t));
|
else {
|
||||||
pending->ptracecont = ptracecont;
|
memset(pending, 0, sizeof(struct sig_pending));
|
||||||
if(sig == SIGKILL || sig == SIGSTOP)
|
pending->sigmask.__val[0] = mask;
|
||||||
list_add(&pending->list, head);
|
memcpy(&pending->info, info, sizeof(siginfo_t));
|
||||||
else
|
pending->ptracecont = ptracecont;
|
||||||
list_add_tail(&pending->list, head);
|
if (sig == SIGKILL || sig == SIGSTOP)
|
||||||
tthread->sigevent = 1;
|
list_add(&pending->list, head);
|
||||||
}
|
else
|
||||||
|
list_add_tail(&pending->list, head);
|
||||||
|
tthread->sigevent = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
mcs_rwlock_writer_unlock_noirq(savelock, &mcs_rw_node);
|
mcs_rwlock_writer_unlock_noirq(savelock, &mcs_rw_node);
|
||||||
cpu_restore_interrupt(irqstate);
|
cpu_restore_interrupt(irqstate);
|
||||||
|
|
||||||
|
if (sig == SIGCONT || ptracecont == 1) {
|
||||||
|
/* Wake up the target only when stopped by SIGSTOP */
|
||||||
|
if (sched_wakeup_thread(tthread, PS_STOPPED) == 0) {
|
||||||
|
struct siginfo info;
|
||||||
|
|
||||||
|
tthread->proc->main_thread->signal_flags =
|
||||||
|
SIGNAL_STOP_CONTINUED;
|
||||||
|
tthread->proc->status = PS_RUNNING;
|
||||||
|
memset(&info, '\0', sizeof(info));
|
||||||
|
info.si_signo = SIGCHLD;
|
||||||
|
info.si_code = CLD_CONTINUED;
|
||||||
|
info._sifields._sigchld.si_pid = tthread->proc->pid;
|
||||||
|
info._sifields._sigchld.si_status = 0x0000ffff;
|
||||||
|
do_kill(tthread, tthread->proc->parent->pid, -1,
|
||||||
|
SIGCHLD, &info, 0);
|
||||||
|
tthread->proc->status = PS_RUNNING;
|
||||||
|
if (thread != tthread) {
|
||||||
|
ihk_mc_interrupt_cpu(tthread->cpu_id,
|
||||||
|
ihk_mc_get_vector(IHK_GV_IKC));
|
||||||
|
}
|
||||||
|
doint = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
if (doint && !(mask & tthread->sigmask.__val[0])) {
|
if (doint && !(mask & tthread->sigmask.__val[0])) {
|
||||||
int status = tthread->status;
|
int status = tthread->status;
|
||||||
|
|
||||||
if (thread != tthread) {
|
if (thread != tthread) {
|
||||||
dkprintf("do_kill,ipi,pid=%d,cpu_id=%d\n",
|
dkprintf("do_kill,ipi,pid=%d,cpu_id=%d\n",
|
||||||
tproc->pid, tthread->cpu_id);
|
tproc->pid, tthread->cpu_id);
|
||||||
ihk_mc_interrupt_cpu(get_x86_cpu_local_variable(tthread->cpu_id)->apic_id, 0xd0);
|
ihk_mc_interrupt_cpu(tthread->cpu_id,
|
||||||
|
ihk_mc_get_vector(IHK_GV_IKC));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (status != PS_RUNNING) {
|
if (status != PS_RUNNING) {
|
||||||
@@ -1564,11 +1617,6 @@ done:
|
|||||||
/* Wake up the target only when stopped by ptrace-reporting */
|
/* Wake up the target only when stopped by ptrace-reporting */
|
||||||
sched_wakeup_thread(tthread, PS_TRACED | PS_STOPPED | PS_INTERRUPTIBLE);
|
sched_wakeup_thread(tthread, PS_TRACED | PS_STOPPED | PS_INTERRUPTIBLE);
|
||||||
}
|
}
|
||||||
else if(sig == SIGCONT || ptracecont == 1){
|
|
||||||
/* Wake up the target only when stopped by SIGSTOP */
|
|
||||||
sched_wakeup_thread(tthread, PS_STOPPED);
|
|
||||||
tthread->proc->status = PS_RUNNING;
|
|
||||||
}
|
|
||||||
else {
|
else {
|
||||||
sched_wakeup_thread(tthread, PS_INTERRUPTIBLE);
|
sched_wakeup_thread(tthread, PS_INTERRUPTIBLE);
|
||||||
}
|
}
|
||||||
@@ -1593,7 +1641,7 @@ set_signal(int sig, void *regs0, siginfo_t *info)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if ((__sigmask(sig) & thread->sigmask.__val[0])) {
|
if ((__sigmask(sig) & thread->sigmask.__val[0])) {
|
||||||
coredump(thread, regs0);
|
coredump(thread, regs0, sig);
|
||||||
terminate(0, sig | 0x80);
|
terminate(0, sig | 0x80);
|
||||||
}
|
}
|
||||||
do_kill(thread, thread->proc->pid, thread->tid, sig, info, 0);
|
do_kill(thread, thread->proc->pid, thread->tid, sig, info, 0);
|
||||||
@@ -1629,7 +1677,7 @@ SYSCALL_DECLARE(mmap)
|
|||||||
;
|
;
|
||||||
|
|
||||||
const uintptr_t addr0 = ihk_mc_syscall_arg0(ctx);
|
const uintptr_t addr0 = ihk_mc_syscall_arg0(ctx);
|
||||||
const size_t len0 = ihk_mc_syscall_arg1(ctx);
|
size_t len0 = ihk_mc_syscall_arg1(ctx);
|
||||||
const int prot = ihk_mc_syscall_arg2(ctx);
|
const int prot = ihk_mc_syscall_arg2(ctx);
|
||||||
const int flags0 = ihk_mc_syscall_arg3(ctx);
|
const int flags0 = ihk_mc_syscall_arg3(ctx);
|
||||||
const int fd = ihk_mc_syscall_arg4(ctx);
|
const int fd = ihk_mc_syscall_arg4(ctx);
|
||||||
@@ -1668,7 +1716,9 @@ SYSCALL_DECLARE(mmap)
|
|||||||
if (flags & MAP_HUGETLB) {
|
if (flags & MAP_HUGETLB) {
|
||||||
switch (flags & (0x3F << MAP_HUGE_SHIFT)) {
|
switch (flags & (0x3F << MAP_HUGE_SHIFT)) {
|
||||||
case 0:
|
case 0:
|
||||||
flags |= MAP_HUGE_2MB; /* default hugepage size */
|
/* default hugepage size */
|
||||||
|
flags |= ihk_mc_get_linux_default_huge_page_shift() <<
|
||||||
|
MAP_HUGE_SHIFT;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case MAP_HUGE_2MB:
|
case MAP_HUGE_2MB:
|
||||||
@@ -1684,16 +1734,29 @@ SYSCALL_DECLARE(mmap)
|
|||||||
}
|
}
|
||||||
|
|
||||||
pgsize = (size_t)1 << ((flags >> MAP_HUGE_SHIFT) & 0x3F);
|
pgsize = (size_t)1 << ((flags >> MAP_HUGE_SHIFT) & 0x3F);
|
||||||
|
/* Round-up map length by pagesize */
|
||||||
|
len0 = ALIGN(len0, pgsize);
|
||||||
|
|
||||||
|
if (rusage_check_overmap(len0,
|
||||||
|
(flags >> MAP_HUGE_SHIFT) & 0x3F)) {
|
||||||
|
error = -ENOMEM;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#define VALID_DUMMY_ADDR ((region->user_start + PTL3_SIZE - 1) & ~(PTL3_SIZE - 1))
|
#define VALID_DUMMY_ADDR ((region->user_start + PTL3_SIZE - 1) & ~(PTL3_SIZE - 1))
|
||||||
addr = (flags & MAP_FIXED)? addr0: VALID_DUMMY_ADDR;
|
addr = addr0;
|
||||||
len = (len0 + pgsize - 1) & ~(pgsize - 1);
|
len = (len0 + pgsize - 1) & ~(pgsize - 1);
|
||||||
|
recheck:
|
||||||
if ((addr & (pgsize - 1))
|
if ((addr & (pgsize - 1))
|
||||||
|| (len == 0)
|
|| (len == 0)
|
||||||
|| !(flags & (MAP_SHARED | MAP_PRIVATE))
|
|| !(flags & (MAP_SHARED | MAP_PRIVATE))
|
||||||
|| ((flags & MAP_SHARED) && (flags & MAP_PRIVATE))
|
|| ((flags & MAP_SHARED) && (flags & MAP_PRIVATE))
|
||||||
|| (off0 & (pgsize - 1))) {
|
|| (off0 & (pgsize - 1))) {
|
||||||
|
if (!(flags & MAP_FIXED) && addr != VALID_DUMMY_ADDR) {
|
||||||
|
addr = VALID_DUMMY_ADDR;
|
||||||
|
goto recheck;
|
||||||
|
}
|
||||||
ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):EINVAL\n",
|
ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):EINVAL\n",
|
||||||
addr0, len0, prot, flags0, fd, off0);
|
addr0, len0, prot, flags0, fd, off0);
|
||||||
error = -EINVAL;
|
error = -EINVAL;
|
||||||
@@ -1703,6 +1766,10 @@ SYSCALL_DECLARE(mmap)
|
|||||||
if (addr < region->user_start
|
if (addr < region->user_start
|
||||||
|| region->user_end <= addr
|
|| region->user_end <= addr
|
||||||
|| len > (region->user_end - region->user_start)) {
|
|| len > (region->user_end - region->user_start)) {
|
||||||
|
if (!(flags & MAP_FIXED) && addr != VALID_DUMMY_ADDR) {
|
||||||
|
addr = VALID_DUMMY_ADDR;
|
||||||
|
goto recheck;
|
||||||
|
}
|
||||||
ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):ENOMEM\n",
|
ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):ENOMEM\n",
|
||||||
addr0, len0, prot, flags0, fd, off0);
|
addr0, len0, prot, flags0, fd, off0);
|
||||||
error = -ENOMEM;
|
error = -ENOMEM;
|
||||||
@@ -1719,7 +1786,7 @@ SYSCALL_DECLARE(mmap)
|
|||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
addr = do_mmap(addr, len, prot, flags, fd, off0);
|
addr = do_mmap(addr, len, prot, flags, fd, off0, 0, NULL);
|
||||||
|
|
||||||
error = 0;
|
error = 0;
|
||||||
out:
|
out:
|
||||||
@@ -1730,10 +1797,20 @@ out:
|
|||||||
|
|
||||||
SYSCALL_DECLARE(clone)
|
SYSCALL_DECLARE(clone)
|
||||||
{
|
{
|
||||||
return do_fork((int)ihk_mc_syscall_arg0(ctx), ihk_mc_syscall_arg1(ctx),
|
struct process *proc = cpu_local_var(current)->proc;
|
||||||
|
struct mcs_rwlock_node_irqsave lock_dump;
|
||||||
|
unsigned long ret;
|
||||||
|
|
||||||
|
/* mutex coredump */
|
||||||
|
mcs_rwlock_reader_lock(&proc->coredump_lock, &lock_dump);
|
||||||
|
|
||||||
|
ret = do_fork((int)ihk_mc_syscall_arg0(ctx), ihk_mc_syscall_arg1(ctx),
|
||||||
ihk_mc_syscall_arg2(ctx), ihk_mc_syscall_arg3(ctx),
|
ihk_mc_syscall_arg2(ctx), ihk_mc_syscall_arg3(ctx),
|
||||||
ihk_mc_syscall_arg4(ctx), ihk_mc_syscall_pc(ctx),
|
ihk_mc_syscall_arg4(ctx), ihk_mc_syscall_pc(ctx),
|
||||||
ihk_mc_syscall_sp(ctx));
|
ihk_mc_syscall_sp(ctx));
|
||||||
|
|
||||||
|
mcs_rwlock_reader_unlock(&proc->coredump_lock, &lock_dump);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
SYSCALL_DECLARE(fork)
|
SYSCALL_DECLARE(fork)
|
||||||
@@ -1761,7 +1838,9 @@ SYSCALL_DECLARE(shmget)
|
|||||||
int hugeshift = shmflg & (0x3F << SHM_HUGE_SHIFT);
|
int hugeshift = shmflg & (0x3F << SHM_HUGE_SHIFT);
|
||||||
|
|
||||||
if (hugeshift == 0) {
|
if (hugeshift == 0) {
|
||||||
shmflg |= SHM_HUGE_2MB; /* default hugepage size */
|
/* default hugepage size */
|
||||||
|
shmflg |= ihk_mc_get_linux_default_huge_page_shift() <<
|
||||||
|
MAP_HUGE_SHIFT;
|
||||||
} else if (hugeshift == SHM_HUGE_2MB ||
|
} else if (hugeshift == SHM_HUGE_2MB ||
|
||||||
hugeshift == SHM_HUGE_1GB) {
|
hugeshift == SHM_HUGE_1GB) {
|
||||||
/*nop*/
|
/*nop*/
|
||||||
@@ -2036,7 +2115,7 @@ int arch_map_vdso(struct process_vm *vm)
|
|||||||
vrflags |= VR_PROT_READ | VR_PROT_EXEC;
|
vrflags |= VR_PROT_READ | VR_PROT_EXEC;
|
||||||
vrflags |= VRFLAG_PROT_TO_MAXPROT(vrflags);
|
vrflags |= VRFLAG_PROT_TO_MAXPROT(vrflags);
|
||||||
error = add_process_memory_range(vm, (intptr_t)s, (intptr_t)e,
|
error = add_process_memory_range(vm, (intptr_t)s, (intptr_t)e,
|
||||||
NOPHYS, vrflags, NULL, 0, PAGE_SHIFT, &range);
|
NOPHYS, vrflags, NULL, 0, PAGE_SHIFT, NULL, &range);
|
||||||
if (error) {
|
if (error) {
|
||||||
ekprintf("ERROR: adding memory range for vdso. %d\n", error);
|
ekprintf("ERROR: adding memory range for vdso. %d\n", error);
|
||||||
goto out;
|
goto out;
|
||||||
@@ -2068,7 +2147,8 @@ int arch_map_vdso(struct process_vm *vm)
|
|||||||
vrflags |= VR_PROT_READ;
|
vrflags |= VR_PROT_READ;
|
||||||
vrflags |= VRFLAG_PROT_TO_MAXPROT(vrflags);
|
vrflags |= VRFLAG_PROT_TO_MAXPROT(vrflags);
|
||||||
error = add_process_memory_range(vm, (intptr_t)s, (intptr_t)e,
|
error = add_process_memory_range(vm, (intptr_t)s, (intptr_t)e,
|
||||||
NOPHYS, vrflags, NULL, 0, PAGE_SHIFT, &range);
|
NOPHYS, vrflags, NULL, 0,
|
||||||
|
PAGE_SHIFT, NULL, &range);
|
||||||
if (error) {
|
if (error) {
|
||||||
ekprintf("ERROR: adding memory range for vvar. %d\n", error);
|
ekprintf("ERROR: adding memory range for vvar. %d\n", error);
|
||||||
goto out;
|
goto out;
|
||||||
@@ -2811,4 +2891,46 @@ time_t time(void) {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void calculate_time_from_tsc(struct timespec *ts)
|
||||||
|
{
|
||||||
|
unsigned long seq;
|
||||||
|
unsigned long seq2;
|
||||||
|
unsigned long ns;
|
||||||
|
unsigned long delta;
|
||||||
|
struct vsyscall_gtod_data *gtod = vdso.vgtod_virt;
|
||||||
|
|
||||||
|
do {
|
||||||
|
for (;;) {
|
||||||
|
seq = ACCESS_ONCE(gtod->seq);
|
||||||
|
if (unlikely(seq & 1)) {
|
||||||
|
cpu_pause();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
rmb(); /* fetch sequence before time */
|
||||||
|
ts->tv_sec = gtod->wall_time_sec;
|
||||||
|
ns = gtod->wall_time_snsec;
|
||||||
|
delta = rdtsc() - gtod->clock.cycle_last;
|
||||||
|
ns += delta * gtod->clock.mult;
|
||||||
|
ns >>= gtod->clock.shift;
|
||||||
|
seq2 = ACCESS_ONCE(gtod->seq);
|
||||||
|
rmb(); /* fetch time before checking sequence */
|
||||||
|
} while (seq != seq2);
|
||||||
|
ts->tv_nsec = ns;
|
||||||
|
|
||||||
|
if (ts->tv_nsec >= NS_PER_SEC) {
|
||||||
|
ts->tv_nsec -= NS_PER_SEC;
|
||||||
|
++ts->tv_sec;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
extern void ptrace_syscall_event(struct thread *thread);
|
||||||
|
long arch_ptrace_syscall_event(struct thread *thread,
|
||||||
|
ihk_mc_user_context_t *ctx, long setret)
|
||||||
|
{
|
||||||
|
ihk_mc_syscall_ret(ctx) = setret;
|
||||||
|
ptrace_syscall_event(thread);
|
||||||
|
return ihk_mc_syscall_ret(ctx);
|
||||||
|
}
|
||||||
/*** End of File ***/
|
/*** End of File ***/
|
||||||
|
|||||||
@@ -1,10 +0,0 @@
|
|||||||
[Unit]
|
|
||||||
Description=irqbalance daemon
|
|
||||||
After=syslog.target
|
|
||||||
|
|
||||||
[Service]
|
|
||||||
EnvironmentFile=/tmp/irqbalance_mck
|
|
||||||
ExecStart=/usr/sbin/irqbalance --foreground $IRQBALANCE_ARGS
|
|
||||||
|
|
||||||
[Install]
|
|
||||||
WantedBy=multi-user.target
|
|
||||||
@@ -1,150 +0,0 @@
|
|||||||
# mcoverlay-create-smp-x86.sh.in COPYRIGHT FUJITSU LIMITED 2018
|
|
||||||
# Overlay /proc, /sys with McKernel specific contents
|
|
||||||
|
|
||||||
#
|
|
||||||
# Revert any state that has been initialized before the error occured.
|
|
||||||
#
|
|
||||||
if [ -z "$(declare -f error_exit)" ]; then
|
|
||||||
error_exit() {
|
|
||||||
local status=$1
|
|
||||||
|
|
||||||
case $status in
|
|
||||||
mcos_sys_mounted)
|
|
||||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
|
||||||
umount /tmp/mcos/mcos0_sys
|
|
||||||
fi
|
|
||||||
;&
|
|
||||||
mcos_proc_mounted)
|
|
||||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
|
||||||
umount /tmp/mcos/mcos0_proc
|
|
||||||
fi
|
|
||||||
;&
|
|
||||||
mcoverlayfs_loaded)
|
|
||||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
|
||||||
rmmod mcoverlay 2>/dev/null
|
|
||||||
fi
|
|
||||||
;&
|
|
||||||
linux_proc_bind_mounted)
|
|
||||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
|
||||||
umount /tmp/mcos/linux_proc
|
|
||||||
fi
|
|
||||||
;&
|
|
||||||
tmp_mcos_mounted)
|
|
||||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
|
||||||
umount /tmp/mcos
|
|
||||||
fi
|
|
||||||
;&
|
|
||||||
tmp_mcos_created)
|
|
||||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
|
||||||
rm -rf /tmp/mcos
|
|
||||||
fi
|
|
||||||
;&
|
|
||||||
initial)
|
|
||||||
# Nothing more to revert
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
# Retun -EINVAL
|
|
||||||
exit -22
|
|
||||||
}
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ ! -e /tmp/mcos ]; then
|
|
||||||
mkdir -p /tmp/mcos;
|
|
||||||
fi
|
|
||||||
if ! mount -t tmpfs tmpfs /tmp/mcos; then
|
|
||||||
echo "error: mount /tmp/mcos" >&2
|
|
||||||
error_exit "tmp_mcos_created"
|
|
||||||
fi
|
|
||||||
if [ ! -e /tmp/mcos/linux_proc ]; then
|
|
||||||
mkdir -p /tmp/mcos/linux_proc;
|
|
||||||
fi
|
|
||||||
if ! mount --bind /proc /tmp/mcos/linux_proc; then
|
|
||||||
echo "error: mount /tmp/mcos/linux_proc" >&2
|
|
||||||
error_exit "tmp_mcos_mounted"
|
|
||||||
fi
|
|
||||||
if ! taskset -c 0 insmod @KMODDIR@/mcoverlay.ko 2>/dev/null; then
|
|
||||||
echo "error: inserting mcoverlay.ko" >&2
|
|
||||||
error_exit "linux_proc_bind_mounted"
|
|
||||||
fi
|
|
||||||
while [ ! -e /proc/mcos0 ]
|
|
||||||
do
|
|
||||||
sleep 0.1
|
|
||||||
done
|
|
||||||
if [ ! -e /tmp/mcos/mcos0_proc ]; then
|
|
||||||
mkdir -p /tmp/mcos/mcos0_proc;
|
|
||||||
fi
|
|
||||||
if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then
|
|
||||||
mkdir -p /tmp/mcos/mcos0_proc_upper;
|
|
||||||
fi
|
|
||||||
if [ ! -e /tmp/mcos/mcos0_proc_work ]; then
|
|
||||||
mkdir -p /tmp/mcos/mcos0_proc_work;
|
|
||||||
fi
|
|
||||||
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then
|
|
||||||
echo "error: mounting /tmp/mcos/mcos0_proc" >&2
|
|
||||||
error_exit "mcoverlayfs_loaded"
|
|
||||||
fi
|
|
||||||
# TODO: How de we revert this in case of failure??
|
|
||||||
mount --make-rprivate /proc
|
|
||||||
|
|
||||||
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys/setup_complete ]
|
|
||||||
do
|
|
||||||
sleep 0.1
|
|
||||||
done
|
|
||||||
if [ ! -e /tmp/mcos/mcos0_sys ]; then
|
|
||||||
mkdir -p /tmp/mcos/mcos0_sys;
|
|
||||||
fi
|
|
||||||
if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then
|
|
||||||
mkdir -p /tmp/mcos/mcos0_sys_upper;
|
|
||||||
fi
|
|
||||||
if [ ! -e /tmp/mcos/mcos0_sys_work ]; then
|
|
||||||
mkdir -p /tmp/mcos/mcos0_sys_work;
|
|
||||||
fi
|
|
||||||
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then
|
|
||||||
echo "error: mount /tmp/mcos/mcos0_sys" >&2
|
|
||||||
error_exit "mcos_proc_mounted"
|
|
||||||
fi
|
|
||||||
# TODO: How de we revert this in case of failure??
|
|
||||||
mount --make-rprivate /sys
|
|
||||||
|
|
||||||
touch /tmp/mcos/mcos0_proc/mckernel
|
|
||||||
|
|
||||||
rm -rf /tmp/mcos/mcos0_sys/setup_complete
|
|
||||||
|
|
||||||
# Hide NUMA related files which are outside the LWK partition
|
|
||||||
for cpuid in `find /sys/devices/system/cpu/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
|
|
||||||
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid" ]; then
|
|
||||||
rm -rf /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid
|
|
||||||
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
|
|
||||||
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/drivers/processor/$cpuid
|
|
||||||
else
|
|
||||||
for nodeid in `find /sys/devices/system/cpu/$cpuid/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
|
|
||||||
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid/$nodeid" ]; then
|
|
||||||
rm -f /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid/$nodeid
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
for nodeid in `find /sys/devices/system/node/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
|
|
||||||
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid" ]; then
|
|
||||||
rm -rf /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/*
|
|
||||||
rm -rf /tmp/mcos/mcos0_sys/bus/node/devices/$nodeid
|
|
||||||
else
|
|
||||||
# Delete non-existent symlinks
|
|
||||||
for cpuid in `find /sys/devices/system/node/$nodeid/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
|
|
||||||
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid/$cpuid" ]; then
|
|
||||||
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/$cpuid
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/memory*
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
rm -f /tmp/mcos/mcos0_sys/devices/system/node/has_*
|
|
||||||
for cpuid in `find /sys/bus/cpu/devices/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
|
|
||||||
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/bus/cpu/devices/$cpuid" ]; then
|
|
||||||
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
exit 0
|
|
||||||
@@ -1,16 +0,0 @@
|
|||||||
# Remove mcoverlay if loaded
|
|
||||||
|
|
||||||
if grep mcoverlay /proc/modules &>/dev/null; then
|
|
||||||
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
|
|
||||||
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
|
|
||||||
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
|
|
||||||
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
|
|
||||||
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
|
|
||||||
if ! rmmod mcoverlay 2>/dev/null; then
|
|
||||||
echo "error: removing mcoverlay" >&2
|
|
||||||
# Return -EINVAL
|
|
||||||
exit -22
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
exit 0
|
|
||||||
@@ -14,6 +14,28 @@ mark_as_advanced(
|
|||||||
KBUILD_MAKE_FLAGS
|
KBUILD_MAKE_FLAGS
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if (${CMAKE_GENERATOR} STREQUAL Ninja)
|
||||||
|
set(MAKE "make")
|
||||||
|
list(APPEND KBUILD_MAKE_FLAGS "-j")
|
||||||
|
else ()
|
||||||
|
set(MAKE "$(MAKE)")
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
# Convert McKernel "arm64" into Linux "aarch64"
|
||||||
|
if ("${ARCH}" STREQUAL "arm64")
|
||||||
|
set(LINUX_ARCH "aarch64")
|
||||||
|
else ()
|
||||||
|
set(LINUX_ARCH "${ARCH}")
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
if (NOT "${LINUX_ARCH}" STREQUAL "${CMAKE_HOST_SYSTEM_PROCESSOR}")
|
||||||
|
string(REGEX REPLACE "ld$" "" CROSS_COMPILE "${CMAKE_LINKER}")
|
||||||
|
list(APPEND KBUILD_MAKE_FLAGS "ARCH=${ARCH}")
|
||||||
|
list(APPEND KBUILD_MAKE_FLAGS "CROSS_COMPILE=${CROSS_COMPILE}")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
string(REPLACE ";" " " KBUILD_MAKE_FLAGS_STR "${KBUILD_MAKE_FLAGS}")
|
||||||
|
|
||||||
function(kmod MODULE_NAME)
|
function(kmod MODULE_NAME)
|
||||||
cmake_parse_arguments(KMOD "" "INSTALL_DEST" "C_FLAGS;SOURCES;EXTRA_SYMBOLS;DEPENDS" ${ARGN})
|
cmake_parse_arguments(KMOD "" "INSTALL_DEST" "C_FLAGS;SOURCES;EXTRA_SYMBOLS;DEPENDS" ${ARGN})
|
||||||
|
|
||||||
@@ -33,17 +55,6 @@ endif(ENABLE_WERROR)
|
|||||||
configure_file(${CMAKE_SOURCE_DIR}/cmake/modules/Kbuild.in
|
configure_file(${CMAKE_SOURCE_DIR}/cmake/modules/Kbuild.in
|
||||||
${CMAKE_CURRENT_BINARY_DIR}/Kbuild)
|
${CMAKE_CURRENT_BINARY_DIR}/Kbuild)
|
||||||
|
|
||||||
if (${CMAKE_GENERATOR} STREQUAL Ninja)
|
|
||||||
set(MAKE "make")
|
|
||||||
list(APPEND KBUILD_MAKE_FLAGS "-j")
|
|
||||||
else ()
|
|
||||||
set(MAKE "$(MAKE)")
|
|
||||||
endif ()
|
|
||||||
if (NOT "${ARCH}" STREQUAL "${CMAKE_HOST_SYSTEM_PROCESSOR}")
|
|
||||||
string(REGEX REPLACE "ld$" "" CROSS_COMPILE "${CMAKE_LINKER}")
|
|
||||||
list(APPEND KBUILD_MAKE_FLAGS "ARCH=${ARCH};CROSS_COMPILE=${CROSS_COMPILE}")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
string(REGEX REPLACE "\\.c(;|$)" ".o.cmd\\1" KMOD_O_CMD "${KMOD_SOURCES}")
|
string(REGEX REPLACE "\\.c(;|$)" ".o.cmd\\1" KMOD_O_CMD "${KMOD_SOURCES}")
|
||||||
string(REGEX REPLACE "[^/;]+(;|$)" ".\\0" KMOD_O_CMD "${KMOD_O_CMD}")
|
string(REGEX REPLACE "[^/;]+(;|$)" ".\\0" KMOD_O_CMD "${KMOD_O_CMD}")
|
||||||
|
|
||||||
@@ -78,6 +89,10 @@ endif(ENABLE_WERROR)
|
|||||||
# the native build system do these checks, if possible at all...
|
# the native build system do these checks, if possible at all...
|
||||||
add_custom_command(OUTPUT kmod_always_rebuild COMMAND touch kmod_always_rebuild)
|
add_custom_command(OUTPUT kmod_always_rebuild COMMAND touch kmod_always_rebuild)
|
||||||
|
|
||||||
|
if (NOT EXISTS "${KERNEL_DIR}/Makefile")
|
||||||
|
message(FATAL_ERROR "${KERNEL_DIR} does not contain a Makefile and is probably missing. install kernel development package or set the KERNEL_DIR variable")
|
||||||
|
endif()
|
||||||
|
|
||||||
add_custom_command(
|
add_custom_command(
|
||||||
OUTPUT "${MODULE_NAME}.ko"
|
OUTPUT "${MODULE_NAME}.ko"
|
||||||
"Module.symvers"
|
"Module.symvers"
|
||||||
|
|||||||
17
config.h.in
17
config.h.in
@@ -6,8 +6,9 @@
|
|||||||
/* version number */
|
/* version number */
|
||||||
#define MCKERNEL_VERSION "${MCKERNEL_VERSION}"
|
#define MCKERNEL_VERSION "${MCKERNEL_VERSION}"
|
||||||
|
|
||||||
/* whether mcoverlayfs is enabled */
|
/* enable the required code for mcexec to be able to use bind mount
|
||||||
#cmakedefine ENABLE_MCOVERLAYFS 1
|
* there is no config option as its use is discouraged */
|
||||||
|
// #define MCEXEC_BIND_MOUNT 1
|
||||||
|
|
||||||
/* whether memdump feature is enabled */
|
/* whether memdump feature is enabled */
|
||||||
#cmakedefine ENABLE_MEMDUMP 1
|
#cmakedefine ENABLE_MEMDUMP 1
|
||||||
@@ -27,18 +28,12 @@
|
|||||||
/* whether undefined behaviour sanitizer is enabled */
|
/* whether undefined behaviour sanitizer is enabled */
|
||||||
#cmakedefine ENABLE_UBSAN 1
|
#cmakedefine ENABLE_UBSAN 1
|
||||||
|
|
||||||
|
/* whether per-CPU allocator cache (ThunderX2 workaround) is enabled */
|
||||||
|
#cmakedefine ENABLE_PER_CPU_ALLOC_CACHE 1
|
||||||
|
|
||||||
/* Path of bind-mount source directory */
|
/* Path of bind-mount source directory */
|
||||||
#cmakedefine ROOTFSDIR "${ROOTFSDIR}"
|
#cmakedefine ROOTFSDIR "${ROOTFSDIR}"
|
||||||
|
|
||||||
/* Path of install directory for libraries */
|
|
||||||
#cmakedefine MCKERNEL_LIBDIR "${MCKERNEL_LIBDIR}"
|
|
||||||
|
|
||||||
/* Path of install directory for binary */
|
|
||||||
#cmakedefine BINDIR "${BINDIR}"
|
|
||||||
|
|
||||||
/* Path of install directory for system binary */
|
|
||||||
#cmakedefine SBINDIR "${SBINDIR}"
|
|
||||||
|
|
||||||
/* for non-RHEL kernels */
|
/* for non-RHEL kernels */
|
||||||
#ifndef RHEL_RELEASE_VERSION
|
#ifndef RHEL_RELEASE_VERSION
|
||||||
#define RHEL_RELEASE_VERSION(a,b) (((a) << 8) + (b))
|
#define RHEL_RELEASE_VERSION(a,b) (((a) << 8) + (b))
|
||||||
|
|||||||
@@ -1,27 +0,0 @@
|
|||||||
#ifndef IHKLIB_RUSAGE_H_INCLUDED
|
|
||||||
#define IHKLIB_RUSAGE_H_INCLUDED
|
|
||||||
|
|
||||||
#define IHK_MAX_NUM_PGSIZES 4
|
|
||||||
#define IHK_MAX_NUM_NUMA_NODES 1024
|
|
||||||
#define IHK_MAX_NUM_CPUS 1024
|
|
||||||
|
|
||||||
#define IHK_OS_PGSIZE_4KB 0
|
|
||||||
#define IHK_OS_PGSIZE_2MB 1
|
|
||||||
#define IHK_OS_PGSIZE_1GB 2
|
|
||||||
|
|
||||||
struct mckernel_rusage {
|
|
||||||
unsigned long memory_stat_rss[IHK_MAX_NUM_PGSIZES];
|
|
||||||
unsigned long memory_stat_mapped_file[IHK_MAX_NUM_PGSIZES];
|
|
||||||
unsigned long memory_max_usage;
|
|
||||||
unsigned long memory_kmem_usage;
|
|
||||||
unsigned long memory_kmem_max_usage;
|
|
||||||
unsigned long memory_numa_stat[IHK_MAX_NUM_NUMA_NODES];
|
|
||||||
unsigned long cpuacct_stat_system;
|
|
||||||
unsigned long cpuacct_stat_user;
|
|
||||||
unsigned long cpuacct_usage;
|
|
||||||
unsigned long cpuacct_usage_percpu[IHK_MAX_NUM_CPUS];
|
|
||||||
int num_threads;
|
|
||||||
int max_num_threads;
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif /* !defined(IHKLIB_RUSAGE_H_INCLUDED) */
|
|
||||||
@@ -55,7 +55,7 @@
|
|||||||
#define MCEXEC_UP_SYS_UNSHARE 0x30a02916
|
#define MCEXEC_UP_SYS_UNSHARE 0x30a02916
|
||||||
|
|
||||||
#define MCEXEC_UP_UTI_GET_CTX 0x30a02920
|
#define MCEXEC_UP_UTI_GET_CTX 0x30a02920
|
||||||
#define MCEXEC_UP_UTI_SAVE_FS 0x30a02921
|
#define MCEXEC_UP_UTI_SWITCH_CTX 0x30a02921
|
||||||
#define MCEXEC_UP_SIG_THREAD 0x30a02922
|
#define MCEXEC_UP_SIG_THREAD 0x30a02922
|
||||||
#define MCEXEC_UP_SYSCALL_THREAD 0x30a02924
|
#define MCEXEC_UP_SYSCALL_THREAD 0x30a02924
|
||||||
#define MCEXEC_UP_TERMINATE_THREAD 0x30a02925
|
#define MCEXEC_UP_TERMINATE_THREAD 0x30a02925
|
||||||
@@ -92,6 +92,7 @@ struct program_image_section {
|
|||||||
struct get_cpu_set_arg {
|
struct get_cpu_set_arg {
|
||||||
int nr_processes;
|
int nr_processes;
|
||||||
int *process_rank;
|
int *process_rank;
|
||||||
|
pid_t ppid;
|
||||||
void *cpu_set;
|
void *cpu_set;
|
||||||
size_t cpu_set_size; // Size in bytes
|
size_t cpu_set_size; // Size in bytes
|
||||||
int *target_core;
|
int *target_core;
|
||||||
@@ -193,7 +194,6 @@ struct syscall_response {
|
|||||||
unsigned long req_thread_status;
|
unsigned long req_thread_status;
|
||||||
long ret;
|
long ret;
|
||||||
unsigned long fault_address;
|
unsigned long fault_address;
|
||||||
unsigned long fault_reason;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct syscall_ret_desc {
|
struct syscall_ret_desc {
|
||||||
@@ -359,7 +359,7 @@ struct uti_get_ctx_desc {
|
|||||||
unsigned long key; /* OUT: struct task_struct* of mcexec thread, used to search struct host_thread */
|
unsigned long key; /* OUT: struct task_struct* of mcexec thread, used to search struct host_thread */
|
||||||
};
|
};
|
||||||
|
|
||||||
struct uti_save_fs_desc {
|
struct uti_switch_ctx_desc {
|
||||||
void *rctx; /* Remote context */
|
void *rctx; /* Remote context */
|
||||||
void *lctx; /* Local context */
|
void *lctx; /* Local context */
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ if(ARCH STREQUAL "x86_64")
|
|||||||
set(ARCH_C_FLAGS "-mno-red-zone -mcmodel=kernel")
|
set(ARCH_C_FLAGS "-mno-red-zone -mcmodel=kernel")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
set(MCEXEC_PATH "${CMAKE_INSTALL_FULL_BINDIR}/mcexec" CACHE STRING "mcexec path for binfmt")
|
||||||
|
|
||||||
kmod(mcctrl
|
kmod(mcctrl
|
||||||
C_FLAGS
|
C_FLAGS
|
||||||
-I${IHK_FULL_SOURCE_DIR}/linux/include
|
-I${IHK_FULL_SOURCE_DIR}/linux/include
|
||||||
@@ -16,7 +18,7 @@ kmod(mcctrl
|
|||||||
-I${CMAKE_CURRENT_SOURCE_DIR}/arch/${ARCH}/include
|
-I${CMAKE_CURRENT_SOURCE_DIR}/arch/${ARCH}/include
|
||||||
-I${PROJECT_BINARY_DIR}
|
-I${PROJECT_BINARY_DIR}
|
||||||
-I${PROJECT_SOURCE_DIR}/kernel/include
|
-I${PROJECT_SOURCE_DIR}/kernel/include
|
||||||
-DMCEXEC_PATH=\\"${CMAKE_INSTALL_FULL_BINDIR}/mcexec\\"
|
-DMCEXEC_PATH=\\"${MCEXEC_PATH}\\"
|
||||||
${ARCH_C_FLAGS}
|
${ARCH_C_FLAGS}
|
||||||
SOURCES
|
SOURCES
|
||||||
driver.c control.c ikc.c syscall.c procfs.c binfmt_mcexec.c
|
driver.c control.c ikc.c syscall.c procfs.c binfmt_mcexec.c
|
||||||
|
|||||||
@@ -1,7 +1,12 @@
|
|||||||
/* archdeps.c COPYRIGHT FUJITSU LIMITED 2016 */
|
/* archdeps.c COPYRIGHT FUJITSU LIMITED 2016-2019 */
|
||||||
#include <linux/version.h>
|
#include <linux/version.h>
|
||||||
#include <linux/mm_types.h>
|
#include <linux/mm_types.h>
|
||||||
#include <linux/kallsyms.h>
|
#include <linux/kallsyms.h>
|
||||||
|
#if KERNEL_VERSION(4, 11, 0) <= LINUX_VERSION_CODE
|
||||||
|
#include <linux/sched/task_stack.h>
|
||||||
|
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) */
|
||||||
|
#include <linux/ptrace.h>
|
||||||
|
#include <linux/uaccess.h>
|
||||||
#include <asm/vdso.h>
|
#include <asm/vdso.h>
|
||||||
#include "config.h"
|
#include "config.h"
|
||||||
#include "../../mcctrl.h"
|
#include "../../mcctrl.h"
|
||||||
@@ -42,7 +47,6 @@ int arch_symbols_init(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_52
|
|
||||||
#define VDSO_MAXPAGES 1
|
#define VDSO_MAXPAGES 1
|
||||||
struct vdso {
|
struct vdso {
|
||||||
long busy;
|
long busy;
|
||||||
@@ -53,7 +57,6 @@ struct vdso {
|
|||||||
long lbase;
|
long lbase;
|
||||||
long offset_sigtramp;
|
long offset_sigtramp;
|
||||||
};
|
};
|
||||||
#endif /*POSTK_DEBUG_ARCH_DEP_52*/
|
|
||||||
|
|
||||||
unsigned long
|
unsigned long
|
||||||
reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, unsigned long end);
|
reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, unsigned long end);
|
||||||
@@ -95,6 +98,74 @@ reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, unsign
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if KERNEL_VERSION(4, 0, 0) <= LINUX_VERSION_CODE
|
||||||
|
static long elf_search_vdso_sigtramp(void)
|
||||||
|
{
|
||||||
|
int i = 0;
|
||||||
|
long ans = -1;
|
||||||
|
char *shstr = NULL, *dynstr = NULL;
|
||||||
|
Elf64_Ehdr *eh = NULL;
|
||||||
|
Elf64_Shdr *tmp_sh = NULL, *sym_sh = NULL;
|
||||||
|
Elf64_Sym *sym = NULL;
|
||||||
|
|
||||||
|
/* ELF header */
|
||||||
|
eh = (Elf64_Ehdr *)vdso_start;
|
||||||
|
if (eh == NULL) {
|
||||||
|
D("vdso_start is NULL.\n");
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ELF magic check */
|
||||||
|
if (eh->e_ident[EI_MAG0] != ELFMAG0 &&
|
||||||
|
eh->e_ident[EI_MAG1] != ELFMAG1 &&
|
||||||
|
eh->e_ident[EI_MAG2] != ELFMAG2 &&
|
||||||
|
eh->e_ident[EI_MAG3] != ELFMAG3) {
|
||||||
|
D("vdso_start ELF MAGIC Mismatch.\n"
|
||||||
|
"e_ident[EI_MAG0 - EI_MAG3]: %02x %02x %02x %02x\n",
|
||||||
|
eh->e_ident[EI_MAG0], eh->e_ident[EI_MAG1],
|
||||||
|
eh->e_ident[EI_MAG2], eh->e_ident[EI_MAG3]);
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Search dynsym-table and dynstr-table offset
|
||||||
|
* from section header table
|
||||||
|
*/
|
||||||
|
tmp_sh = (Elf64_Shdr *)(vdso_start + eh->e_shoff);
|
||||||
|
shstr = vdso_start + (tmp_sh + eh->e_shstrndx)->sh_offset;
|
||||||
|
for (i = 0; i < eh->e_shnum; i++, tmp_sh++) {
|
||||||
|
if (tmp_sh->sh_type == SHT_DYNSYM) {
|
||||||
|
sym_sh = tmp_sh;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tmp_sh->sh_type == SHT_STRTAB &&
|
||||||
|
!strcmp(&shstr[tmp_sh->sh_name], ".dynstr")) {
|
||||||
|
dynstr = vdso_start + tmp_sh->sh_offset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sym_sh == NULL) {
|
||||||
|
D("dynsym-table not found.\n");
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dynstr == 0) {
|
||||||
|
D("dynstr-table not found.\n");
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Search __kernel_rt_sigreturn offset from dynsym-table */
|
||||||
|
sym = (Elf64_Sym *)(vdso_start + sym_sh->sh_offset);
|
||||||
|
for (i = 0; (i * sym_sh->sh_entsize) < sym_sh->sh_size; i++, sym++) {
|
||||||
|
if (!strcmp(dynstr + sym->st_name, "__kernel_rt_sigreturn")) {
|
||||||
|
ans = sym->st_value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out:
|
||||||
|
return ans;
|
||||||
|
}
|
||||||
|
#endif /*LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0)*/
|
||||||
|
|
||||||
void get_vdso_info(ihk_os_t os, long vdso_rpa)
|
void get_vdso_info(ihk_os_t os, long vdso_rpa)
|
||||||
{
|
{
|
||||||
ihk_device_t dev = ihk_os_to_dev(os);
|
ihk_device_t dev = ihk_os_to_dev(os);
|
||||||
@@ -128,7 +199,12 @@ void get_vdso_info(ihk_os_t os, long vdso_rpa)
|
|||||||
|
|
||||||
/* offsets */
|
/* offsets */
|
||||||
vdso->lbase = VDSO_LBASE;
|
vdso->lbase = VDSO_LBASE;
|
||||||
vdso->offset_sigtramp = vdso_offset_sigtramp;
|
vdso->offset_sigtramp = elf_search_vdso_sigtramp();
|
||||||
|
|
||||||
|
if (unlikely(vdso->offset_sigtramp == -1)) {
|
||||||
|
D("Use vdso_offset_sigtramp in header-file.\n");
|
||||||
|
vdso->offset_sigtramp = vdso_offset_sigtramp;
|
||||||
|
}
|
||||||
#endif /*LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0)*/
|
#endif /*LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0)*/
|
||||||
out:
|
out:
|
||||||
wmb();
|
wmb();
|
||||||
@@ -142,59 +218,61 @@ out:
|
|||||||
void *
|
void *
|
||||||
get_user_sp(void)
|
get_user_sp(void)
|
||||||
{
|
{
|
||||||
/* TODO; skeleton for UTI */
|
return (void *)current_pt_regs()->sp;
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
set_user_sp(void *usp)
|
set_user_sp(void *usp)
|
||||||
{
|
{
|
||||||
/* TODO; skeleton for UTI */
|
current_pt_regs()->sp = (unsigned long)usp;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* TODO; skeleton for UTI */
|
|
||||||
struct trans_uctx {
|
struct trans_uctx {
|
||||||
volatile int cond;
|
volatile int cond;
|
||||||
int fregsize;
|
int fregsize;
|
||||||
|
struct user_pt_regs regs;
|
||||||
unsigned long rax;
|
unsigned long tls_baseaddr;
|
||||||
unsigned long rbx;
|
|
||||||
unsigned long rcx;
|
|
||||||
unsigned long rdx;
|
|
||||||
unsigned long rsi;
|
|
||||||
unsigned long rdi;
|
|
||||||
unsigned long rbp;
|
|
||||||
unsigned long r8;
|
|
||||||
unsigned long r9;
|
|
||||||
unsigned long r10;
|
|
||||||
unsigned long r11;
|
|
||||||
unsigned long r12;
|
|
||||||
unsigned long r13;
|
|
||||||
unsigned long r14;
|
|
||||||
unsigned long r15;
|
|
||||||
unsigned long rflags;
|
|
||||||
unsigned long rip;
|
|
||||||
unsigned long rsp;
|
|
||||||
unsigned long fs;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
void
|
void
|
||||||
restore_fs(unsigned long fs)
|
restore_tls(unsigned long addr)
|
||||||
{
|
{
|
||||||
/* TODO; skeleton for UTI */
|
const unsigned long tpidrro = 0;
|
||||||
|
|
||||||
|
asm volatile(
|
||||||
|
" msr tpidr_el0, %0\n"
|
||||||
|
" msr tpidrro_el0, %1"
|
||||||
|
: : "r" (addr), "r" (tpidrro));
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
save_fs_ctx(void *ctx)
|
save_tls_ctx(void __user *ctx)
|
||||||
{
|
{
|
||||||
/* TODO; skeleton for UTI */
|
struct trans_uctx __user *tctx = ctx;
|
||||||
|
unsigned long baseaddr;
|
||||||
|
|
||||||
|
asm volatile(
|
||||||
|
" mrs %0, tpidr_el0"
|
||||||
|
: "=r" (baseaddr));
|
||||||
|
|
||||||
|
if (copy_to_user(&tctx->tls_baseaddr, &baseaddr,
|
||||||
|
sizeof(tctx->tls_baseaddr))) {
|
||||||
|
pr_err("%s: copy_to_user failed.\n", __func__);
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned long
|
unsigned long
|
||||||
get_fs_ctx(void *ctx)
|
get_tls_ctx(void __user *ctx)
|
||||||
{
|
{
|
||||||
/* TODO; skeleton for UTI */
|
struct trans_uctx __user *tctx = ctx;
|
||||||
return 0;
|
struct trans_uctx kctx;
|
||||||
|
|
||||||
|
if (copy_from_user(&kctx, tctx, sizeof(struct trans_uctx))) {
|
||||||
|
pr_err("%s: copy_from_user failed.\n", __func__);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return kctx.tls_baseaddr;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0)
|
#if LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0)
|
||||||
@@ -304,3 +382,38 @@ out:
|
|||||||
error, rva, rpa, pgsize);
|
error, rva, rpa, pgsize);
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Assembler switch_ctx executes only ioctl.
|
||||||
|
* Context register save/load is done on Linux (get from current_pt_regs).
|
||||||
|
* Do TLS save/load and register host_thread with ioctl.
|
||||||
|
*/
|
||||||
|
long arch_switch_ctx(struct uti_switch_ctx_desc *desc)
|
||||||
|
{
|
||||||
|
int rc = 0;
|
||||||
|
struct trans_uctx *__user rctx = NULL;
|
||||||
|
struct trans_uctx *__user lctx = NULL;
|
||||||
|
struct trans_uctx klctx = {
|
||||||
|
.regs = current_pt_regs()->user_regs,
|
||||||
|
};
|
||||||
|
|
||||||
|
rctx = desc->rctx;
|
||||||
|
lctx = desc->lctx;
|
||||||
|
|
||||||
|
if (copy_to_user(lctx, &klctx, sizeof(klctx))) {
|
||||||
|
pr_err("%s: Error: copy_to_user failed\n", __func__);
|
||||||
|
rc = -EFAULT;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (copy_from_user(¤t_pt_regs()->user_regs,
|
||||||
|
&rctx->regs, sizeof(rctx->regs))) {
|
||||||
|
pr_err("%s: Error: copy_from_user failed\n", __func__);
|
||||||
|
rc = -EFAULT;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
restore_tls(get_tls_ctx(rctx));
|
||||||
|
|
||||||
|
out:
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|||||||
@@ -9,8 +9,6 @@
|
|||||||
extern int translate_rva_to_rpa(ihk_os_t os, unsigned long rpt, unsigned long rva,
|
extern int translate_rva_to_rpa(ihk_os_t os, unsigned long rpt, unsigned long rva,
|
||||||
unsigned long *rpap, unsigned long *pgsizep);
|
unsigned long *rpap, unsigned long *pgsizep);
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_12
|
|
||||||
|
|
||||||
#define PFN_WRITE_COMBINED PTE_ATTRINDX(MT_NORMAL_NC)
|
#define PFN_WRITE_COMBINED PTE_ATTRINDX(MT_NORMAL_NC)
|
||||||
static inline bool pte_is_write_combined(pte_t pte)
|
static inline bool pte_is_write_combined(pte_t pte)
|
||||||
{
|
{
|
||||||
@@ -31,9 +29,8 @@ static inline bool pte_is_write_combined(pte_t pte)
|
|||||||
#endif
|
#endif
|
||||||
return ((pte_val(pte) & PTE_ATTRINDX_MASK) == PFN_WRITE_COMBINED);
|
return ((pte_val(pte) & PTE_ATTRINDX_MASK) == PFN_WRITE_COMBINED);
|
||||||
}
|
}
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_12 */
|
|
||||||
|
|
||||||
#define ARMV8_IDX_COUNTER0 1
|
#define ARMV8_IDX_COUNTER0 0
|
||||||
#define ARCH_PERF_COUNTER_START ARMV8_IDX_COUNTER0
|
#define ARCH_PERF_COUNTER_START ARMV8_IDX_COUNTER0
|
||||||
|
|
||||||
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0)
|
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0)
|
||||||
|
|||||||
@@ -2,9 +2,13 @@
|
|||||||
#include <linux/version.h>
|
#include <linux/version.h>
|
||||||
#include <linux/kallsyms.h>
|
#include <linux/kallsyms.h>
|
||||||
#include <linux/uaccess.h>
|
#include <linux/uaccess.h>
|
||||||
|
#include <asm/vsyscall.h>
|
||||||
|
#include <asm/vgtod.h>
|
||||||
#include "config.h"
|
#include "config.h"
|
||||||
#include "../../mcctrl.h"
|
#include "../../mcctrl.h"
|
||||||
|
|
||||||
|
#define gtod (&VVAR(vsyscall_gtod_data))
|
||||||
|
|
||||||
//#define SC_DEBUG
|
//#define SC_DEBUG
|
||||||
|
|
||||||
#ifdef SC_DEBUG
|
#ifdef SC_DEBUG
|
||||||
@@ -54,7 +58,6 @@ int arch_symbols_init(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_52
|
|
||||||
#define VDSO_MAXPAGES 2
|
#define VDSO_MAXPAGES 2
|
||||||
struct vdso {
|
struct vdso {
|
||||||
long busy;
|
long busy;
|
||||||
@@ -70,8 +73,8 @@ struct vdso {
|
|||||||
long hpet_phys;
|
long hpet_phys;
|
||||||
void *pvti_virt;
|
void *pvti_virt;
|
||||||
long pvti_phys;
|
long pvti_phys;
|
||||||
|
void *vgtod_virt;
|
||||||
};
|
};
|
||||||
#endif /*POSTK_DEBUG_ARCH_DEP_52*/
|
|
||||||
|
|
||||||
unsigned long
|
unsigned long
|
||||||
reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, unsigned long end);
|
reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, unsigned long end);
|
||||||
@@ -207,6 +210,7 @@ void get_vdso_info(ihk_os_t os, long vdso_rpa)
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
vdso->vgtod_virt = (void *)gtod;
|
||||||
out:
|
out:
|
||||||
wmb();
|
wmb();
|
||||||
vdso->busy = 0;
|
vdso->busy = 0;
|
||||||
@@ -257,25 +261,35 @@ struct trans_uctx {
|
|||||||
};
|
};
|
||||||
|
|
||||||
void
|
void
|
||||||
restore_fs(unsigned long fs)
|
restore_tls(unsigned long addr)
|
||||||
{
|
{
|
||||||
wrmsrl(MSR_FS_BASE, fs);
|
wrmsrl(MSR_FS_BASE, addr);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
save_fs_ctx(void *ctx)
|
save_tls_ctx(void __user *ctx)
|
||||||
{
|
{
|
||||||
struct trans_uctx *tctx = ctx;
|
struct trans_uctx __user *tctx = ctx;
|
||||||
|
struct trans_uctx kctx;
|
||||||
|
|
||||||
rdmsrl(MSR_FS_BASE, tctx->fs);
|
if (copy_from_user(&kctx, tctx, sizeof(struct trans_uctx))) {
|
||||||
|
pr_err("%s: copy_from_user failed.\n", __func__);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
rdmsrl(MSR_FS_BASE, kctx.fs);
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned long
|
unsigned long
|
||||||
get_fs_ctx(void *ctx)
|
get_tls_ctx(void __user *ctx)
|
||||||
{
|
{
|
||||||
struct trans_uctx *tctx = ctx;
|
struct trans_uctx __user *tctx = ctx;
|
||||||
|
struct trans_uctx kctx;
|
||||||
|
|
||||||
return tctx->fs;
|
if (copy_from_user(&kctx, tctx, sizeof(struct trans_uctx))) {
|
||||||
|
pr_err("%s: copy_from_user failed.\n", __func__);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return kctx.fs;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned long
|
unsigned long
|
||||||
@@ -356,11 +370,17 @@ out:
|
|||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_12
|
|
||||||
#define PFN_WRITE_COMBINED _PAGE_PWT
|
#define PFN_WRITE_COMBINED _PAGE_PWT
|
||||||
static inline bool pte_is_write_combined(pte_t pte)
|
static inline bool pte_is_write_combined(pte_t pte)
|
||||||
{
|
{
|
||||||
return ((pte_flags(pte) & _PAGE_PWT) && !(pte_flags(pte) & _PAGE_PCD));
|
return ((pte_flags(pte) & _PAGE_PWT) && !(pte_flags(pte) & _PAGE_PCD));
|
||||||
}
|
}
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_12 */
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The assembler switch_ctx is save/load registers in the context.
|
||||||
|
* Do TLS save/load and register host_thread with ioctl.
|
||||||
|
*/
|
||||||
|
long arch_switch_ctx(struct uti_switch_ctx_desc *desc)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|||||||
@@ -9,14 +9,12 @@
|
|||||||
extern int translate_rva_to_rpa(ihk_os_t os, unsigned long rpt, unsigned long rva,
|
extern int translate_rva_to_rpa(ihk_os_t os, unsigned long rpt, unsigned long rva,
|
||||||
unsigned long *rpap, unsigned long *pgsizep);
|
unsigned long *rpap, unsigned long *pgsizep);
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_12
|
|
||||||
#define PFN_WRITE_COMBINED _PAGE_PWT
|
#define PFN_WRITE_COMBINED _PAGE_PWT
|
||||||
|
|
||||||
static inline bool pte_is_write_combined(pte_t pte)
|
static inline bool pte_is_write_combined(pte_t pte)
|
||||||
{
|
{
|
||||||
return ((pte_flags(pte) & _PAGE_PWT) && !(pte_flags(pte) & _PAGE_PCD));
|
return ((pte_flags(pte) & _PAGE_PWT) && !(pte_flags(pte) & _PAGE_PCD));
|
||||||
}
|
}
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_12 */
|
|
||||||
|
|
||||||
#define ARCH_PERF_COUNTER_START 0
|
#define ARCH_PERF_COUNTER_START 0
|
||||||
|
|
||||||
|
|||||||
@@ -44,7 +44,6 @@
|
|||||||
#include <config.h>
|
#include <config.h>
|
||||||
#include "mcctrl.h"
|
#include "mcctrl.h"
|
||||||
#include <ihk/ihk_host_user.h>
|
#include <ihk/ihk_host_user.h>
|
||||||
#include <ihklib_rusage.h>
|
|
||||||
#include <rusage.h>
|
#include <rusage.h>
|
||||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
|
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
|
||||||
#include <uapi/linux/sched/types.h>
|
#include <uapi/linux/sched/types.h>
|
||||||
@@ -87,8 +86,19 @@ int syscall_backward(struct mcctrl_usrdata *, int, unsigned long, unsigned long,
|
|||||||
unsigned long, unsigned long, unsigned long,
|
unsigned long, unsigned long, unsigned long,
|
||||||
unsigned long, unsigned long *);
|
unsigned long, unsigned long *);
|
||||||
|
|
||||||
|
struct mcos_handler_info {
|
||||||
|
int pid;
|
||||||
|
int cpu;
|
||||||
|
struct mcctrl_usrdata *ud;
|
||||||
|
struct file *file;
|
||||||
|
unsigned long user_start;
|
||||||
|
unsigned long user_end;
|
||||||
|
unsigned long prepare_thread;
|
||||||
|
};
|
||||||
|
|
||||||
static long mcexec_prepare_image(ihk_os_t os,
|
static long mcexec_prepare_image(ihk_os_t os,
|
||||||
struct program_load_desc * __user udesc)
|
struct program_load_desc * __user udesc,
|
||||||
|
struct file *file)
|
||||||
{
|
{
|
||||||
struct program_load_desc *desc = NULL;
|
struct program_load_desc *desc = NULL;
|
||||||
struct program_load_desc *pdesc = NULL;
|
struct program_load_desc *pdesc = NULL;
|
||||||
@@ -100,6 +110,7 @@ static long mcexec_prepare_image(ihk_os_t os,
|
|||||||
struct mcctrl_per_proc_data *ppd = NULL;
|
struct mcctrl_per_proc_data *ppd = NULL;
|
||||||
int num_sections;
|
int num_sections;
|
||||||
int free_ikc_pointers = 1;
|
int free_ikc_pointers = 1;
|
||||||
|
struct mcos_handler_info *info;
|
||||||
|
|
||||||
if (!usrdata) {
|
if (!usrdata) {
|
||||||
pr_err("%s: error: mcctrl_usrdata not found\n", __func__);
|
pr_err("%s: error: mcctrl_usrdata not found\n", __func__);
|
||||||
@@ -122,6 +133,14 @@ static long mcexec_prepare_image(ihk_os_t os,
|
|||||||
goto free_out;
|
goto free_out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
info = ihk_os_get_mcos_private_data(file);
|
||||||
|
if (!info) {
|
||||||
|
ret = -EFAULT;
|
||||||
|
goto free_out;
|
||||||
|
}
|
||||||
|
/* To serialize SCD_MSG_SCHEDULE_PROCESS and SCD_MSG_CLEANUP_PROCESS */
|
||||||
|
info->cpu = desc->cpu;
|
||||||
|
|
||||||
ppd = mcctrl_get_per_proc_data(usrdata, desc->pid);
|
ppd = mcctrl_get_per_proc_data(usrdata, desc->pid);
|
||||||
if (!ppd) {
|
if (!ppd) {
|
||||||
printk("%s: ERROR: no per process data for PID %d\n",
|
printk("%s: ERROR: no per process data for PID %d\n",
|
||||||
@@ -193,6 +212,11 @@ static long mcexec_prepare_image(ihk_os_t os,
|
|||||||
/* either send or remote prepare_process failed */
|
/* either send or remote prepare_process failed */
|
||||||
goto put_and_free_out;
|
goto put_and_free_out;
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
|
* Used as SCD_MSG_CLEANUP_PROCESS target which isn't scheduled
|
||||||
|
* with SCD_MSG_SCHEDULE_PROCESS
|
||||||
|
*/
|
||||||
|
info->prepare_thread = pdesc->rprocess;
|
||||||
|
|
||||||
/* Update rpgtable */
|
/* Update rpgtable */
|
||||||
ppd->rpgtable = pdesc->rpgtable;
|
ppd->rpgtable = pdesc->rpgtable;
|
||||||
@@ -307,30 +331,10 @@ int mcexec_transfer_image(ihk_os_t os, struct remote_transfer *__user upt)
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
struct mcos_handler_info {
|
|
||||||
int pid;
|
|
||||||
int cpu;
|
|
||||||
struct mcctrl_usrdata *ud;
|
|
||||||
struct file *file;
|
|
||||||
unsigned long user_start;
|
|
||||||
unsigned long user_end;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct mcos_handler_info;
|
struct mcos_handler_info;
|
||||||
static LIST_HEAD(host_threads); /* Used for FS switch */
|
static LIST_HEAD(host_threads); /* Used for FS switch */
|
||||||
DEFINE_RWLOCK(host_thread_lock);
|
DEFINE_RWLOCK(host_thread_lock);
|
||||||
|
|
||||||
/* Info of Linux counterpart of migrated-to-Linux thread */
|
|
||||||
struct host_thread {
|
|
||||||
struct list_head list;
|
|
||||||
struct mcos_handler_info *handler;
|
|
||||||
int pid;
|
|
||||||
int tid;
|
|
||||||
unsigned long usp;
|
|
||||||
unsigned long lfs;
|
|
||||||
unsigned long rfs;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct mcos_handler_info *new_mcos_handler_info(ihk_os_t os, struct file *file)
|
struct mcos_handler_info *new_mcos_handler_info(ihk_os_t os, struct file *file)
|
||||||
{
|
{
|
||||||
struct mcos_handler_info *info;
|
struct mcos_handler_info *info;
|
||||||
@@ -391,6 +395,7 @@ static void release_handler(ihk_os_t os, void *param)
|
|||||||
memset(&isp, '\0', sizeof isp);
|
memset(&isp, '\0', sizeof isp);
|
||||||
isp.msg = SCD_MSG_CLEANUP_PROCESS;
|
isp.msg = SCD_MSG_CLEANUP_PROCESS;
|
||||||
isp.pid = info->pid;
|
isp.pid = info->pid;
|
||||||
|
isp.arg = info->prepare_thread;
|
||||||
|
|
||||||
dprintk("%s: SCD_MSG_CLEANUP_PROCESS, info: %p, cpu: %d\n",
|
dprintk("%s: SCD_MSG_CLEANUP_PROCESS, info: %p, cpu: %d\n",
|
||||||
__FUNCTION__, info, info->cpu);
|
__FUNCTION__, info, info->cpu);
|
||||||
@@ -426,6 +431,7 @@ static long mcexec_start_image(ihk_os_t os,
|
|||||||
struct mcctrl_channel *c;
|
struct mcctrl_channel *c;
|
||||||
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
|
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
|
||||||
struct mcos_handler_info *info;
|
struct mcos_handler_info *info;
|
||||||
|
struct mcos_handler_info *prev_info;
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
|
|
||||||
if (!usrdata) {
|
if (!usrdata) {
|
||||||
@@ -446,6 +452,7 @@ static long mcexec_start_image(ihk_os_t os,
|
|||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
prev_info = ihk_os_get_mcos_private_data(file);
|
||||||
info = new_mcos_handler_info(os, file);
|
info = new_mcos_handler_info(os, file);
|
||||||
if (info == NULL) {
|
if (info == NULL) {
|
||||||
ret = -ENOMEM;
|
ret = -ENOMEM;
|
||||||
@@ -456,6 +463,7 @@ static long mcexec_start_image(ihk_os_t os,
|
|||||||
info->cpu = desc->cpu;
|
info->cpu = desc->cpu;
|
||||||
info->user_start = desc->user_start;
|
info->user_start = desc->user_start;
|
||||||
info->user_end = desc->user_end;
|
info->user_end = desc->user_end;
|
||||||
|
info->prepare_thread = prev_info->prepare_thread;
|
||||||
ihk_os_register_release_handler(file, release_handler, info);
|
ihk_os_register_release_handler(file, release_handler, info);
|
||||||
ihk_os_set_mcos_private_data(file, info);
|
ihk_os_set_mcos_private_data(file, info);
|
||||||
|
|
||||||
@@ -472,8 +480,10 @@ static long mcexec_start_image(ihk_os_t os,
|
|||||||
ret = mcctrl_ikc_send(os, desc->cpu, &isp);
|
ret = mcctrl_ikc_send(os, desc->cpu, &isp);
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
printk("%s: error: sending IKC msg\n", __FUNCTION__);
|
printk("%s: error: sending IKC msg\n", __FUNCTION__);
|
||||||
|
goto out;
|
||||||
}
|
}
|
||||||
|
/* clear prepared thread struct */
|
||||||
|
info->prepare_thread = 0;
|
||||||
out:
|
out:
|
||||||
kfree(desc);
|
kfree(desc);
|
||||||
return ret;
|
return ret;
|
||||||
@@ -577,17 +587,14 @@ extern int mckernel_cpu_2_linux_cpu(struct mcctrl_usrdata *udp, int cpu_id);
|
|||||||
static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
|
static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
|
||||||
{
|
{
|
||||||
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
|
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
|
||||||
struct mcctrl_part_exec *pe;
|
struct mcctrl_part_exec *pe = NULL, *pe_itr;
|
||||||
struct get_cpu_set_arg req;
|
struct get_cpu_set_arg req;
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
|
||||||
struct mcctrl_cpu_topology *cpu_top, *cpu_top_i;
|
struct mcctrl_cpu_topology *cpu_top, *cpu_top_i;
|
||||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
struct cpu_topology *cpu_top, *cpu_top_i;
|
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
struct cache_topology *cache_top;
|
struct cache_topology *cache_top;
|
||||||
int cpu, cpus_assigned, cpus_to_assign, cpu_prev;
|
int cpu, cpus_assigned, cpus_to_assign, cpu_prev;
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
int mcexec_linux_numa;
|
int mcexec_linux_numa;
|
||||||
|
int pe_list_len = 0;
|
||||||
cpumask_t *mcexec_cpu_set = NULL;
|
cpumask_t *mcexec_cpu_set = NULL;
|
||||||
cpumask_t *cpus_used = NULL;
|
cpumask_t *cpus_used = NULL;
|
||||||
cpumask_t *cpus_to_use = NULL;
|
cpumask_t *cpus_to_use = NULL;
|
||||||
@@ -607,24 +614,54 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
|
|||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
pe = &udp->part_exec;
|
|
||||||
|
|
||||||
mutex_lock(&pe->lock);
|
|
||||||
|
|
||||||
if (copy_from_user(&req, (void *)arg, sizeof(req))) {
|
if (copy_from_user(&req, (void *)arg, sizeof(req))) {
|
||||||
printk("%s: error copying user request\n", __FUNCTION__);
|
pr_err("%s: error copying user request\n", __func__);
|
||||||
ret = -EINVAL;
|
ret = -EINVAL;
|
||||||
goto put_and_unlock_out;
|
goto put_out;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* First process to enter CPU partitioning */
|
mutex_lock(&udp->part_exec_lock);
|
||||||
if (pe->nr_processes == -1) {
|
/* Find part_exec having same node_proxy */
|
||||||
|
list_for_each_entry_reverse(pe_itr, &udp->part_exec_list, chain) {
|
||||||
|
pe_list_len++;
|
||||||
|
if (pe_itr->node_proxy_pid == req.ppid) {
|
||||||
|
pe = pe_itr;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!pe) {
|
||||||
|
/* First process to enter CPU partitioning */
|
||||||
|
pr_debug("%s: pe_list_len:%d\n", __func__, pe_list_len);
|
||||||
|
if (pe_list_len >= PE_LIST_MAXLEN) {
|
||||||
|
/* delete head entry of pe_list */
|
||||||
|
pe_itr = list_first_entry(&udp->part_exec_list,
|
||||||
|
struct mcctrl_part_exec, chain);
|
||||||
|
list_del(&pe_itr->chain);
|
||||||
|
kfree(pe_itr);
|
||||||
|
}
|
||||||
|
|
||||||
|
pe = kzalloc(sizeof(struct mcctrl_part_exec), GFP_KERNEL);
|
||||||
|
if (!pe) {
|
||||||
|
mutex_unlock(&udp->part_exec_lock);
|
||||||
|
ret = -ENOMEM;
|
||||||
|
goto put_out;
|
||||||
|
}
|
||||||
|
/* Init part_exec */
|
||||||
|
mutex_init(&pe->lock);
|
||||||
|
INIT_LIST_HEAD(&pe->pli_list);
|
||||||
pe->nr_processes = req.nr_processes;
|
pe->nr_processes = req.nr_processes;
|
||||||
pe->nr_processes_left = req.nr_processes;
|
pe->nr_processes_left = req.nr_processes;
|
||||||
|
pe->nr_processes_joined = 0;
|
||||||
|
pe->node_proxy_pid = req.ppid;
|
||||||
|
|
||||||
|
list_add_tail(&pe->chain, &udp->part_exec_list);
|
||||||
dprintk("%s: nr_processes: %d (partitioned exec starts)\n",
|
dprintk("%s: nr_processes: %d (partitioned exec starts)\n",
|
||||||
__FUNCTION__,
|
__func__, pe->nr_processes);
|
||||||
pe->nr_processes);
|
|
||||||
}
|
}
|
||||||
|
mutex_unlock(&udp->part_exec_lock);
|
||||||
|
|
||||||
|
mutex_lock(&pe->lock);
|
||||||
|
|
||||||
if (pe->nr_processes != req.nr_processes) {
|
if (pe->nr_processes != req.nr_processes) {
|
||||||
printk("%s: error: requested number of processes"
|
printk("%s: error: requested number of processes"
|
||||||
@@ -634,7 +671,15 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
|
|||||||
goto put_and_unlock_out;
|
goto put_and_unlock_out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (pe->nr_processes_joined >= pe->nr_processes) {
|
||||||
|
printk("%s: too many processes have joined to the group of %d\n",
|
||||||
|
__func__, req.ppid);
|
||||||
|
ret = -EINVAL;
|
||||||
|
goto put_and_unlock_out;
|
||||||
|
}
|
||||||
|
|
||||||
--pe->nr_processes_left;
|
--pe->nr_processes_left;
|
||||||
|
++pe->nr_processes_joined;
|
||||||
dprintk("%s: nr_processes: %d, nr_processes_left: %d\n",
|
dprintk("%s: nr_processes: %d, nr_processes_left: %d\n",
|
||||||
__FUNCTION__,
|
__FUNCTION__,
|
||||||
pe->nr_processes,
|
pe->nr_processes,
|
||||||
@@ -720,8 +765,6 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
|
|||||||
wake_up_interruptible(&pli_next->pli_wq);
|
wake_up_interruptible(&pli_next->pli_wq);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Reset process counter to start state */
|
|
||||||
pe->nr_processes = -1;
|
|
||||||
ret = -ETIMEDOUT;
|
ret = -ETIMEDOUT;
|
||||||
goto put_and_unlock_out;
|
goto put_and_unlock_out;
|
||||||
}
|
}
|
||||||
@@ -969,16 +1012,8 @@ next_cpu:
|
|||||||
/* Commit used cores to OS structure */
|
/* Commit used cores to OS structure */
|
||||||
memcpy(&pe->cpus_used, cpus_used, sizeof(*cpus_used));
|
memcpy(&pe->cpus_used, cpus_used, sizeof(*cpus_used));
|
||||||
|
|
||||||
/* Reset if last process */
|
/* If not last process, wake up next process in list */
|
||||||
if (pe->nr_processes_left == 0) {
|
if (pe->nr_processes_left != 0) {
|
||||||
dprintk("%s: nr_processes: %d (partitioned exec ends)\n",
|
|
||||||
__FUNCTION__,
|
|
||||||
pe->nr_processes);
|
|
||||||
pe->nr_processes = -1;
|
|
||||||
memset(&pe->cpus_used, 0, sizeof(pe->cpus_used));
|
|
||||||
}
|
|
||||||
/* Otherwise wake up next process in list */
|
|
||||||
else {
|
|
||||||
++pe->process_rank;
|
++pe->process_rank;
|
||||||
pli_next = list_first_entry(&pe->pli_list,
|
pli_next = list_first_entry(&pe->pli_list,
|
||||||
struct process_list_item, list);
|
struct process_list_item, list);
|
||||||
@@ -991,11 +1026,14 @@ next_cpu:
|
|||||||
ret = 0;
|
ret = 0;
|
||||||
|
|
||||||
put_and_unlock_out:
|
put_and_unlock_out:
|
||||||
|
mutex_unlock(&pe->lock);
|
||||||
|
|
||||||
|
put_out:
|
||||||
|
mcctrl_put_per_proc_data(ppd);
|
||||||
|
|
||||||
kfree(cpus_to_use);
|
kfree(cpus_to_use);
|
||||||
kfree(cpus_used);
|
kfree(cpus_used);
|
||||||
kfree(mcexec_cpu_set);
|
kfree(mcexec_cpu_set);
|
||||||
mcctrl_put_per_proc_data(ppd);
|
|
||||||
mutex_unlock(&pe->lock);
|
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@@ -2376,7 +2414,7 @@ long mcctrl_getrusage(ihk_os_t ihk_os, struct mcctrl_ioctl_getrusage_desc *__use
|
|||||||
{
|
{
|
||||||
struct mcctrl_ioctl_getrusage_desc desc;
|
struct mcctrl_ioctl_getrusage_desc desc;
|
||||||
struct rusage_global *rusage_global = ihk_os_get_rusage(ihk_os);
|
struct rusage_global *rusage_global = ihk_os_get_rusage(ihk_os);
|
||||||
struct mckernel_rusage *rusage = NULL;
|
struct ihk_os_rusage *rusage = NULL;
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
int i;
|
int i;
|
||||||
unsigned long ut;
|
unsigned long ut;
|
||||||
@@ -2388,13 +2426,13 @@ long mcctrl_getrusage(ihk_os_t ihk_os, struct mcctrl_ioctl_getrusage_desc *__use
|
|||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
rusage = kmalloc(sizeof(struct mckernel_rusage), GFP_KERNEL);
|
rusage = kmalloc(sizeof(struct ihk_os_rusage), GFP_KERNEL);
|
||||||
if (!rusage) {
|
if (!rusage) {
|
||||||
printk("%s: kmalloc failed\n", __FUNCTION__);
|
printk("%s: kmalloc failed\n", __FUNCTION__);
|
||||||
ret = -ENOMEM;
|
ret = -ENOMEM;
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
memset(rusage, 0, sizeof(struct mckernel_rusage));
|
memset(rusage, 0, sizeof(struct ihk_os_rusage));
|
||||||
|
|
||||||
/* Compile statistics */
|
/* Compile statistics */
|
||||||
for (i = 0; i < IHK_MAX_NUM_PGSIZES; i++) {
|
for (i = 0; i < IHK_MAX_NUM_PGSIZES; i++) {
|
||||||
@@ -2415,15 +2453,17 @@ long mcctrl_getrusage(ihk_os_t ihk_os, struct mcctrl_ioctl_getrusage_desc *__use
|
|||||||
st += rusage_global->cpu[i].system_tsc * rusage_global->ns_per_tsc / 1000;
|
st += rusage_global->cpu[i].system_tsc * rusage_global->ns_per_tsc / 1000;
|
||||||
rusage->cpuacct_usage_percpu[i] = wt;
|
rusage->cpuacct_usage_percpu[i] = wt;
|
||||||
}
|
}
|
||||||
rusage->cpuacct_stat_system = st / 10000000;
|
rusage->cpuacct_stat_system = (st + 10000000 - 1) / 10000000;
|
||||||
rusage->cpuacct_stat_user = ut / 10000000;
|
rusage->cpuacct_stat_user = (ut + 10000000 - 1) / 10000000;
|
||||||
rusage->cpuacct_usage = ut;
|
rusage->cpuacct_usage = ut;
|
||||||
|
|
||||||
rusage->num_threads = rusage_global->num_threads;
|
rusage->num_threads = rusage_global->num_threads;
|
||||||
rusage->max_num_threads = rusage_global->max_num_threads;
|
rusage->max_num_threads = rusage_global->max_num_threads;
|
||||||
|
|
||||||
if (desc.size_rusage > sizeof(struct mckernel_rusage)) {
|
if (desc.size_rusage > sizeof(struct ihk_os_rusage)) {
|
||||||
printk("%s: desc.size_rusage=%ld > sizeof(struct mckernel_rusage)=%ld\n", __FUNCTION__, desc.size_rusage, sizeof(struct mckernel_rusage));
|
printk("%s: desc.size_rusage=%ld > sizeof(struct mckernel_rusage)=%ld\n",
|
||||||
|
__func__, desc.size_rusage,
|
||||||
|
sizeof(struct ihk_os_rusage));
|
||||||
ret = -EINVAL;
|
ret = -EINVAL;
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
@@ -2444,10 +2484,10 @@ long mcctrl_getrusage(ihk_os_t ihk_os, struct mcctrl_ioctl_getrusage_desc *__use
|
|||||||
|
|
||||||
extern void *get_user_sp(void);
|
extern void *get_user_sp(void);
|
||||||
extern void set_user_sp(unsigned long);
|
extern void set_user_sp(unsigned long);
|
||||||
extern void restore_fs(unsigned long fs);
|
extern void restore_tls(unsigned long addr);
|
||||||
extern void save_fs_ctx(void *);
|
extern void save_tls_ctx(void __user *ctx);
|
||||||
extern unsigned long get_fs_ctx(void *);
|
extern unsigned long get_tls_ctx(void __user *ctx);
|
||||||
extern unsigned long get_rsp_ctx(void *);
|
extern unsigned long get_rsp_ctx(void *ctx);
|
||||||
|
|
||||||
long mcexec_uti_get_ctx(ihk_os_t os, struct uti_get_ctx_desc __user *udesc)
|
long mcexec_uti_get_ctx(ihk_os_t os, struct uti_get_ctx_desc __user *udesc)
|
||||||
{
|
{
|
||||||
@@ -2491,14 +2531,15 @@ long mcexec_uti_get_ctx(ihk_os_t os, struct uti_get_ctx_desc __user *udesc)
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
long mcexec_uti_save_fs(ihk_os_t os, struct uti_save_fs_desc __user *udesc, struct file *file)
|
long mcctrl_switch_ctx(ihk_os_t os, struct uti_switch_ctx_desc __user *udesc,
|
||||||
|
struct file *file)
|
||||||
{
|
{
|
||||||
int rc = 0;
|
int rc = 0;
|
||||||
void *usp = get_user_sp();
|
void *usp = get_user_sp();
|
||||||
struct mcos_handler_info *info;
|
struct mcos_handler_info *info;
|
||||||
struct host_thread *thread;
|
struct host_thread *thread;
|
||||||
unsigned long flags;
|
unsigned long flags;
|
||||||
struct uti_save_fs_desc desc;
|
struct uti_switch_ctx_desc desc;
|
||||||
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
|
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
|
||||||
struct mcctrl_per_proc_data *ppd;
|
struct mcctrl_per_proc_data *ppd;
|
||||||
|
|
||||||
@@ -2508,21 +2549,26 @@ long mcexec_uti_save_fs(ihk_os_t os, struct uti_save_fs_desc __user *udesc, stru
|
|||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(copy_from_user(&desc, udesc, sizeof(struct uti_save_fs_desc))) {
|
if (copy_from_user(&desc, udesc, sizeof(struct uti_switch_ctx_desc))) {
|
||||||
printk("%s: Error: copy_from_user failed\n", __FUNCTION__);
|
printk("%s: Error: copy_from_user failed\n", __FUNCTION__);
|
||||||
rc = -EFAULT;
|
rc = -EFAULT;
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
save_fs_ctx(desc.lctx);
|
rc = arch_switch_ctx(&desc);
|
||||||
|
if (rc < 0) {
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
save_tls_ctx(desc.lctx);
|
||||||
info = ihk_os_get_mcos_private_data(file);
|
info = ihk_os_get_mcos_private_data(file);
|
||||||
thread = kmalloc(sizeof(struct host_thread), GFP_KERNEL);
|
thread = kmalloc(sizeof(struct host_thread), GFP_KERNEL);
|
||||||
memset(thread, '\0', sizeof(struct host_thread));
|
memset(thread, '\0', sizeof(struct host_thread));
|
||||||
thread->pid = task_tgid_vnr(current);
|
thread->pid = task_tgid_vnr(current);
|
||||||
thread->tid = task_pid_vnr(current);
|
thread->tid = task_pid_vnr(current);
|
||||||
thread->usp = (unsigned long)usp;
|
thread->usp = (unsigned long)usp;
|
||||||
thread->lfs = get_fs_ctx(desc.lctx);
|
thread->ltls = get_tls_ctx(desc.lctx);
|
||||||
thread->rfs = get_fs_ctx(desc.rctx);
|
thread->rtls = get_tls_ctx(desc.rctx);
|
||||||
thread->handler = info;
|
thread->handler = info;
|
||||||
|
|
||||||
write_lock_irqsave(&host_thread_lock, flags);
|
write_lock_irqsave(&host_thread_lock, flags);
|
||||||
@@ -2568,9 +2614,9 @@ mcexec_sig_thread(ihk_os_t os, unsigned long arg, struct file *file)
|
|||||||
read_unlock_irqrestore(&host_thread_lock, flags);
|
read_unlock_irqrestore(&host_thread_lock, flags);
|
||||||
if (thread) {
|
if (thread) {
|
||||||
if (arg)
|
if (arg)
|
||||||
restore_fs(thread->lfs);
|
restore_tls(thread->ltls);
|
||||||
else
|
else
|
||||||
restore_fs(thread->rfs);
|
restore_tls(thread->rtls);
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
ret = -EINVAL;
|
ret = -EINVAL;
|
||||||
@@ -2774,8 +2820,7 @@ long mcexec_syscall_thread(ihk_os_t os, unsigned long arg, struct file *file)
|
|||||||
return -EFAULT;
|
return -EFAULT;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* debug */
|
if (param.number == __NR_futex) {
|
||||||
if (0 && param.number == __NR_futex) {
|
|
||||||
struct uti_futex_resp resp = {
|
struct uti_futex_resp resp = {
|
||||||
.done = 0
|
.done = 0
|
||||||
};
|
};
|
||||||
@@ -2971,13 +3016,8 @@ mcexec_uti_attr(ihk_os_t os, struct uti_attr_desc __user *_desc)
|
|||||||
cpumask_t *cpuset = NULL, *env_cpuset = NULL;
|
cpumask_t *cpuset = NULL, *env_cpuset = NULL;
|
||||||
struct mcctrl_usrdata *ud = ihk_host_os_get_usrdata(os);
|
struct mcctrl_usrdata *ud = ihk_host_os_get_usrdata(os);
|
||||||
ihk_device_t dev = ihk_os_to_dev(os);
|
ihk_device_t dev = ihk_os_to_dev(os);
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
|
||||||
struct mcctrl_cpu_topology *cpu_topo;
|
struct mcctrl_cpu_topology *cpu_topo;
|
||||||
struct mcctrl_cpu_topology *target_cpu = NULL;
|
struct mcctrl_cpu_topology *target_cpu = NULL;
|
||||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
struct cpu_topology *cpu_topo;
|
|
||||||
struct cpu_topology *target_cpu = NULL;
|
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
struct node_topology *node_topo;
|
struct node_topology *node_topo;
|
||||||
struct ihk_cache_topology *lcache_topo;
|
struct ihk_cache_topology *lcache_topo;
|
||||||
struct ihk_node_topology *lnode_topo;
|
struct ihk_node_topology *lnode_topo;
|
||||||
@@ -3200,13 +3240,51 @@ out:
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int __mcctrl_control_perm(unsigned int request)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
kuid_t euid;
|
||||||
|
|
||||||
|
/* black list */
|
||||||
|
switch (request) {
|
||||||
|
case IHK_OS_AUX_PERF_NUM:
|
||||||
|
case IHK_OS_AUX_PERF_SET:
|
||||||
|
case IHK_OS_AUX_PERF_GET:
|
||||||
|
case IHK_OS_AUX_PERF_ENABLE:
|
||||||
|
case IHK_OS_AUX_PERF_DISABLE:
|
||||||
|
case IHK_OS_AUX_PERF_DESTROY:
|
||||||
|
euid = current_euid();
|
||||||
|
pr_debug("%s: request=0x%x, euid=%u\n",
|
||||||
|
__func__, request, euid.val);
|
||||||
|
if (euid.val) {
|
||||||
|
ret = -EPERM;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
pr_debug("%s: request=0x%x, ret=%d\n", __func__, request, ret);
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
|
long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
|
||||||
struct file *file)
|
struct file *file)
|
||||||
{
|
{
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
ret = __mcctrl_control_perm(req);
|
||||||
|
if (ret) {
|
||||||
|
pr_err("%s: error: permission denied, req: %x\n",
|
||||||
|
__func__, req);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
switch (req) {
|
switch (req) {
|
||||||
case MCEXEC_UP_PREPARE_IMAGE:
|
case MCEXEC_UP_PREPARE_IMAGE:
|
||||||
return mcexec_prepare_image(os,
|
return mcexec_prepare_image(os,
|
||||||
(struct program_load_desc *)arg);
|
(struct program_load_desc *)arg,
|
||||||
|
file);
|
||||||
case MCEXEC_UP_TRANSFER:
|
case MCEXEC_UP_TRANSFER:
|
||||||
return mcexec_transfer_image(os, (struct remote_transfer *)arg);
|
return mcexec_transfer_image(os, (struct remote_transfer *)arg);
|
||||||
|
|
||||||
@@ -3272,8 +3350,9 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
|
|||||||
case MCEXEC_UP_UTI_GET_CTX:
|
case MCEXEC_UP_UTI_GET_CTX:
|
||||||
return mcexec_uti_get_ctx(os, (struct uti_get_ctx_desc *)arg);
|
return mcexec_uti_get_ctx(os, (struct uti_get_ctx_desc *)arg);
|
||||||
|
|
||||||
case MCEXEC_UP_UTI_SAVE_FS:
|
case MCEXEC_UP_UTI_SWITCH_CTX:
|
||||||
return mcexec_uti_save_fs(os, (struct uti_save_fs_desc *)arg, file);
|
return mcctrl_switch_ctx(os, (struct uti_switch_ctx_desc *)arg,
|
||||||
|
file);
|
||||||
|
|
||||||
case MCEXEC_UP_SIG_THREAD:
|
case MCEXEC_UP_SIG_THREAD:
|
||||||
return mcexec_sig_thread(os, arg, file);
|
return mcexec_sig_thread(os, arg, file);
|
||||||
@@ -3379,7 +3458,8 @@ int mcctrl_get_request_os_cpu(ihk_os_t os, int *ret_cpu)
|
|||||||
*ret_cpu = ch->send.queue->read_cpu;
|
*ret_cpu = ch->send.queue->read_cpu;
|
||||||
ret = 0;
|
ret = 0;
|
||||||
|
|
||||||
printk("%s: OS: %p, CPU: %d\n", __FUNCTION__, os, *ret_cpu);
|
pr_info("%s: OS: %lx, CPU: %d\n",
|
||||||
|
__func__, (unsigned long)os, *ret_cpu);
|
||||||
|
|
||||||
out_put_ppd:
|
out_put_ppd:
|
||||||
mcctrl_put_per_thread_data(ptd);
|
mcctrl_put_per_thread_data(ptd);
|
||||||
@@ -3412,10 +3492,25 @@ int __mcctrl_os_read_write_cpu_register(ihk_os_t os, int cpu,
|
|||||||
struct ihk_os_cpu_register *desc,
|
struct ihk_os_cpu_register *desc,
|
||||||
enum mcctrl_os_cpu_operation op)
|
enum mcctrl_os_cpu_operation op)
|
||||||
{
|
{
|
||||||
|
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
|
||||||
struct ikc_scd_packet isp;
|
struct ikc_scd_packet isp;
|
||||||
struct mcctrl_os_cpu_response resp;
|
struct mcctrl_os_cpu_response resp;
|
||||||
int ret = -EINVAL;
|
int ret = -EINVAL;
|
||||||
|
|
||||||
|
if (!udp) {
|
||||||
|
pr_err("%s: error: mcctrl_usrdata not found\n", __func__);
|
||||||
|
ret = -EINVAL;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cpu < 0 || cpu >= udp->cpu_info->n_cpus) {
|
||||||
|
pr_err("%s: error: cpu (%d) is out of range\n",
|
||||||
|
__func__, cpu);
|
||||||
|
ret = -EINVAL;
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
memset(&isp, '\0', sizeof(struct ikc_scd_packet));
|
memset(&isp, '\0', sizeof(struct ikc_scd_packet));
|
||||||
isp.msg = SCD_MSG_CPU_RW_REG;
|
isp.msg = SCD_MSG_CPU_RW_REG;
|
||||||
isp.op = op;
|
isp.op = op;
|
||||||
@@ -3451,6 +3546,9 @@ int __mcctrl_os_read_write_cpu_register(ihk_os_t os, int cpu,
|
|||||||
desc->val = resp.val;
|
desc->val = resp.val;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Notify caller (for future async implementation) */
|
||||||
|
atomic_set(&desc->sync, 1);
|
||||||
|
|
||||||
dprintk("%s: MCCTRL_OS_CPU_%s_REGISTER: reg: 0x%lx, val: 0x%lx\n",
|
dprintk("%s: MCCTRL_OS_CPU_%s_REGISTER: reg: 0x%lx, val: 0x%lx\n",
|
||||||
__FUNCTION__,
|
__FUNCTION__,
|
||||||
(op == MCCTRL_OS_CPU_READ_REGISTER ? "READ" : "WRITE"),
|
(op == MCCTRL_OS_CPU_READ_REGISTER ? "READ" : "WRITE"),
|
||||||
|
|||||||
@@ -21,8 +21,10 @@
|
|||||||
* 2013/08/19 shirasawa mcexec forward signal to MIC process
|
* 2013/08/19 shirasawa mcexec forward signal to MIC process
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <linux/sched.h>
|
#include <linux/init.h>
|
||||||
#include <linux/module.h>
|
#include <linux/module.h>
|
||||||
|
#include <linux/kernel.h>
|
||||||
|
#include <linux/sched.h>
|
||||||
#include <linux/fs.h>
|
#include <linux/fs.h>
|
||||||
#include <linux/miscdevice.h>
|
#include <linux/miscdevice.h>
|
||||||
#include <linux/slab.h>
|
#include <linux/slab.h>
|
||||||
@@ -80,11 +82,13 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
|
|||||||
{ .request = MCEXEC_UP_CLOSE_EXEC, .func = mcctrl_ioctl },
|
{ .request = MCEXEC_UP_CLOSE_EXEC, .func = mcctrl_ioctl },
|
||||||
{ .request = MCEXEC_UP_GET_CRED, .func = mcctrl_ioctl },
|
{ .request = MCEXEC_UP_GET_CRED, .func = mcctrl_ioctl },
|
||||||
{ .request = MCEXEC_UP_GET_CREDV, .func = mcctrl_ioctl },
|
{ .request = MCEXEC_UP_GET_CREDV, .func = mcctrl_ioctl },
|
||||||
|
#ifdef MCEXEC_BIND_MOUNT
|
||||||
{ .request = MCEXEC_UP_SYS_MOUNT, .func = mcctrl_ioctl },
|
{ .request = MCEXEC_UP_SYS_MOUNT, .func = mcctrl_ioctl },
|
||||||
{ .request = MCEXEC_UP_SYS_UMOUNT, .func = mcctrl_ioctl },
|
{ .request = MCEXEC_UP_SYS_UMOUNT, .func = mcctrl_ioctl },
|
||||||
{ .request = MCEXEC_UP_SYS_UNSHARE, .func = mcctrl_ioctl },
|
{ .request = MCEXEC_UP_SYS_UNSHARE, .func = mcctrl_ioctl },
|
||||||
|
#endif // MCEXEC_BIND_MOUNT
|
||||||
{ .request = MCEXEC_UP_UTI_GET_CTX, .func = mcctrl_ioctl },
|
{ .request = MCEXEC_UP_UTI_GET_CTX, .func = mcctrl_ioctl },
|
||||||
{ .request = MCEXEC_UP_UTI_SAVE_FS, .func = mcctrl_ioctl },
|
{ .request = MCEXEC_UP_UTI_SWITCH_CTX, .func = mcctrl_ioctl },
|
||||||
{ .request = MCEXEC_UP_SIG_THREAD, .func = mcctrl_ioctl },
|
{ .request = MCEXEC_UP_SIG_THREAD, .func = mcctrl_ioctl },
|
||||||
{ .request = MCEXEC_UP_SYSCALL_THREAD, .func = mcctrl_ioctl },
|
{ .request = MCEXEC_UP_SYSCALL_THREAD, .func = mcctrl_ioctl },
|
||||||
{ .request = MCEXEC_UP_TERMINATE_THREAD, .func = mcctrl_ioctl },
|
{ .request = MCEXEC_UP_TERMINATE_THREAD, .func = mcctrl_ioctl },
|
||||||
@@ -170,14 +174,6 @@ error_cleanup_channels:
|
|||||||
int mcctrl_os_shutdown_notifier(int os_index)
|
int mcctrl_os_shutdown_notifier(int os_index)
|
||||||
{
|
{
|
||||||
if (os[os_index]) {
|
if (os[os_index]) {
|
||||||
/* Wait for os running */
|
|
||||||
if (ihk_os_wait_for_status(os[os_index], IHK_OS_STATUS_RUNNING, 0, 200) != 0) {
|
|
||||||
printk("IHK: OS does not become RUNNING in shutdown. Force shutdown.\n");
|
|
||||||
/* send nmi to force shutdown */
|
|
||||||
ihk_os_send_nmi(os[os_index], 3);
|
|
||||||
mdelay(200);
|
|
||||||
}
|
|
||||||
|
|
||||||
pager_cleanup();
|
pager_cleanup();
|
||||||
sysfsm_cleanup(os[os_index]);
|
sysfsm_cleanup(os[os_index]);
|
||||||
free_topology_info(os[os_index]);
|
free_topology_info(os[os_index]);
|
||||||
|
|||||||
@@ -208,6 +208,7 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
|
|||||||
case SCD_MSG_PERF_ACK:
|
case SCD_MSG_PERF_ACK:
|
||||||
case SCD_MSG_SEND_SIGNAL_ACK:
|
case SCD_MSG_SEND_SIGNAL_ACK:
|
||||||
case SCD_MSG_PROCFS_ANSWER:
|
case SCD_MSG_PROCFS_ANSWER:
|
||||||
|
case SCD_MSG_REMOTE_PAGE_FAULT_ANSWER:
|
||||||
mcctrl_wakeup_cb(__os, pisp);
|
mcctrl_wakeup_cb(__os, pisp);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@@ -515,6 +516,7 @@ int prepare_ikc_channels(ihk_os_t os)
|
|||||||
|
|
||||||
init_waitqueue_head(&usrdata->wq_procfs);
|
init_waitqueue_head(&usrdata->wq_procfs);
|
||||||
mutex_init(&usrdata->reserve_lock);
|
mutex_init(&usrdata->reserve_lock);
|
||||||
|
mutex_init(&usrdata->part_exec_lock);
|
||||||
|
|
||||||
for (i = 0; i < MCCTRL_PER_PROC_DATA_HASH_SIZE; ++i) {
|
for (i = 0; i < MCCTRL_PER_PROC_DATA_HASH_SIZE; ++i) {
|
||||||
INIT_LIST_HEAD(&usrdata->per_proc_data_hash[i]);
|
INIT_LIST_HEAD(&usrdata->per_proc_data_hash[i]);
|
||||||
@@ -523,10 +525,8 @@ int prepare_ikc_channels(ihk_os_t os)
|
|||||||
|
|
||||||
INIT_LIST_HEAD(&usrdata->cpu_topology_list);
|
INIT_LIST_HEAD(&usrdata->cpu_topology_list);
|
||||||
INIT_LIST_HEAD(&usrdata->node_topology_list);
|
INIT_LIST_HEAD(&usrdata->node_topology_list);
|
||||||
|
INIT_LIST_HEAD(&usrdata->part_exec_list);
|
||||||
|
|
||||||
mutex_init(&usrdata->part_exec.lock);
|
|
||||||
INIT_LIST_HEAD(&usrdata->part_exec.pli_list);
|
|
||||||
usrdata->part_exec.nr_processes = -1;
|
|
||||||
INIT_LIST_HEAD(&usrdata->wakeup_descs_list);
|
INIT_LIST_HEAD(&usrdata->wakeup_descs_list);
|
||||||
spin_lock_init(&usrdata->wakeup_descs_lock);
|
spin_lock_init(&usrdata->wakeup_descs_lock);
|
||||||
|
|
||||||
@@ -582,6 +582,18 @@ void destroy_ikc_channels(ihk_os_t os)
|
|||||||
|
|
||||||
kfree(usrdata->channels);
|
kfree(usrdata->channels);
|
||||||
kfree(usrdata->ikc2linux);
|
kfree(usrdata->ikc2linux);
|
||||||
|
|
||||||
|
mutex_lock(&usrdata->part_exec_lock);
|
||||||
|
while (!list_empty(&usrdata->part_exec_list)) {
|
||||||
|
struct mcctrl_part_exec *pe;
|
||||||
|
|
||||||
|
pe = list_first_entry(&usrdata->part_exec_list,
|
||||||
|
struct mcctrl_part_exec, chain);
|
||||||
|
list_del(&pe->chain);
|
||||||
|
kfree(pe);
|
||||||
|
}
|
||||||
|
mutex_unlock(&usrdata->part_exec_lock);
|
||||||
|
|
||||||
kfree(usrdata);
|
kfree(usrdata);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -69,6 +69,9 @@
|
|||||||
#define SCD_MSG_PROCFS_ANSWER 0x13
|
#define SCD_MSG_PROCFS_ANSWER 0x13
|
||||||
#define SCD_MSG_PROCFS_RELEASE 0x15
|
#define SCD_MSG_PROCFS_RELEASE 0x15
|
||||||
|
|
||||||
|
#define SCD_MSG_REMOTE_PAGE_FAULT 0x18
|
||||||
|
#define SCD_MSG_REMOTE_PAGE_FAULT_ANSWER 0x19
|
||||||
|
|
||||||
#define SCD_MSG_DEBUG_LOG 0x20
|
#define SCD_MSG_DEBUG_LOG 0x20
|
||||||
|
|
||||||
#define SCD_MSG_SYSFS_REQ_CREATE 0x30
|
#define SCD_MSG_SYSFS_REQ_CREATE 0x30
|
||||||
@@ -121,12 +124,6 @@ enum mcctrl_os_cpu_operation {
|
|||||||
MCCTRL_OS_CPU_MAX_OP
|
MCCTRL_OS_CPU_MAX_OP
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Used to wake-up a Linux thread futex_wait()-ing */
|
|
||||||
struct uti_futex_resp {
|
|
||||||
int done;
|
|
||||||
wait_queue_head_t wq;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ikc_scd_packet {
|
struct ikc_scd_packet {
|
||||||
struct ihk_ikc_packet_header header;
|
struct ihk_ikc_packet_header header;
|
||||||
int msg;
|
int msg;
|
||||||
@@ -172,6 +169,14 @@ struct ikc_scd_packet {
|
|||||||
void *resp;
|
void *resp;
|
||||||
int *spin_sleep; /* 1: waiting in linux_wait_event() 0: woken up by someone else */
|
int *spin_sleep; /* 1: waiting in linux_wait_event() 0: woken up by someone else */
|
||||||
} futex;
|
} futex;
|
||||||
|
|
||||||
|
/* SCD_MSG_REMOTE_PAGE_FAULT */
|
||||||
|
struct {
|
||||||
|
int target_cpu;
|
||||||
|
int fault_tid;
|
||||||
|
unsigned long fault_address;
|
||||||
|
unsigned long fault_reason;
|
||||||
|
};
|
||||||
};
|
};
|
||||||
/* char padding[8]; */ /* We want the size to be 128 bytes */
|
/* char padding[8]; */ /* We want the size to be 128 bytes */
|
||||||
};
|
};
|
||||||
@@ -289,14 +294,11 @@ struct cache_topology {
|
|||||||
struct list_head chain;
|
struct list_head chain;
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
|
||||||
struct mcctrl_cpu_topology {
|
struct mcctrl_cpu_topology {
|
||||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
struct cpu_topology {
|
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
//struct mcctrl_usrdata *udp;
|
//struct mcctrl_usrdata *udp;
|
||||||
struct ihk_cpu_topology *saved;
|
struct ihk_cpu_topology *saved;
|
||||||
int mckernel_cpu_id;
|
int mckernel_cpu_id;
|
||||||
|
int mckernel_core_id;
|
||||||
cpumask_t core_siblings;
|
cpumask_t core_siblings;
|
||||||
cpumask_t thread_siblings;
|
cpumask_t thread_siblings;
|
||||||
|
|
||||||
@@ -323,13 +325,20 @@ struct process_list_item {
|
|||||||
wait_queue_head_t pli_wq;
|
wait_queue_head_t pli_wq;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#define PE_LIST_MAXLEN 5
|
||||||
|
|
||||||
struct mcctrl_part_exec {
|
struct mcctrl_part_exec {
|
||||||
struct mutex lock;
|
struct mutex lock;
|
||||||
int nr_processes;
|
int nr_processes;
|
||||||
|
/* number of processes to let in / out the synchronization point */
|
||||||
int nr_processes_left;
|
int nr_processes_left;
|
||||||
|
/* number of processes which have joined the partition */
|
||||||
|
int nr_processes_joined;
|
||||||
int process_rank;
|
int process_rank;
|
||||||
|
pid_t node_proxy_pid;
|
||||||
cpumask_t cpus_used;
|
cpumask_t cpus_used;
|
||||||
struct list_head pli_list;
|
struct list_head pli_list;
|
||||||
|
struct list_head chain;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))
|
#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))
|
||||||
@@ -352,6 +361,7 @@ struct mcctrl_usrdata {
|
|||||||
int job_pos;
|
int job_pos;
|
||||||
int mcctrl_dma_abort;
|
int mcctrl_dma_abort;
|
||||||
struct mutex reserve_lock;
|
struct mutex reserve_lock;
|
||||||
|
struct mutex part_exec_lock;
|
||||||
unsigned long last_thread_exec;
|
unsigned long last_thread_exec;
|
||||||
wait_queue_head_t wq_procfs;
|
wait_queue_head_t wq_procfs;
|
||||||
struct list_head per_proc_data_hash[MCCTRL_PER_PROC_DATA_HASH_SIZE];
|
struct list_head per_proc_data_hash[MCCTRL_PER_PROC_DATA_HASH_SIZE];
|
||||||
@@ -367,7 +377,7 @@ struct mcctrl_usrdata {
|
|||||||
nodemask_t numa_online;
|
nodemask_t numa_online;
|
||||||
struct list_head cpu_topology_list;
|
struct list_head cpu_topology_list;
|
||||||
struct list_head node_topology_list;
|
struct list_head node_topology_list;
|
||||||
struct mcctrl_part_exec part_exec;
|
struct list_head part_exec_list;
|
||||||
int perf_event_num;
|
int perf_event_num;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -448,40 +458,8 @@ void mcctrl_put_per_proc_data(struct mcctrl_per_proc_data *ppd);
|
|||||||
int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data *ppd, void *data);
|
int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data *ppd, void *data);
|
||||||
void mcctrl_put_per_thread_data_unsafe(struct mcctrl_per_thread_data *ptd);
|
void mcctrl_put_per_thread_data_unsafe(struct mcctrl_per_thread_data *ptd);
|
||||||
void mcctrl_put_per_thread_data(struct mcctrl_per_thread_data* ptd);
|
void mcctrl_put_per_thread_data(struct mcctrl_per_thread_data* ptd);
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_56 /* Strange how to use inline declaration fix. */
|
struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd,
|
||||||
static inline struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd,
|
struct task_struct *task);
|
||||||
struct task_struct *task)
|
|
||||||
{
|
|
||||||
struct mcctrl_per_thread_data *ptd_iter, *ptd = NULL;
|
|
||||||
int hash = (((uint64_t)task >> 4) & MCCTRL_PER_THREAD_DATA_HASH_MASK);
|
|
||||||
unsigned long flags;
|
|
||||||
|
|
||||||
/* Check if data for this thread exists */
|
|
||||||
write_lock_irqsave(&ppd->per_thread_data_hash_lock[hash], flags);
|
|
||||||
|
|
||||||
list_for_each_entry(ptd_iter, &ppd->per_thread_data_hash[hash], hash) {
|
|
||||||
if (ptd_iter->task == task) {
|
|
||||||
ptd = ptd_iter;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ptd) {
|
|
||||||
if (atomic_read(&ptd->refcount) <= 0) {
|
|
||||||
printk("%s: ERROR: use-after-free detected (%d)", __FUNCTION__, atomic_read(&ptd->refcount));
|
|
||||||
ptd = NULL;
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
atomic_inc(&ptd->refcount);
|
|
||||||
}
|
|
||||||
|
|
||||||
out:
|
|
||||||
write_unlock_irqrestore(&ppd->per_thread_data_hash_lock[hash], flags);
|
|
||||||
return ptd;
|
|
||||||
}
|
|
||||||
#else /* POSTK_DEBUG_ARCH_DEP_56 */
|
|
||||||
inline struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd, struct task_struct *task);
|
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_56 */
|
|
||||||
int mcctrl_clear_pte_range(uintptr_t start, uintptr_t len);
|
int mcctrl_clear_pte_range(uintptr_t start, uintptr_t len);
|
||||||
|
|
||||||
void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
|
void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
|
||||||
@@ -526,24 +504,6 @@ void reply_get_cpu_mapping(long req_pa);
|
|||||||
void free_topology_info(ihk_os_t os);
|
void free_topology_info(ihk_os_t os);
|
||||||
|
|
||||||
/* archdep.c */
|
/* archdep.c */
|
||||||
#ifndef POSTK_DEBUG_ARCH_DEP_52
|
|
||||||
#define VDSO_MAXPAGES 2
|
|
||||||
struct vdso {
|
|
||||||
long busy;
|
|
||||||
int vdso_npages;
|
|
||||||
char vvar_is_global;
|
|
||||||
char hpet_is_global;
|
|
||||||
char pvti_is_global;
|
|
||||||
char padding;
|
|
||||||
long vdso_physlist[VDSO_MAXPAGES];
|
|
||||||
void *vvar_virt;
|
|
||||||
long vvar_phys;
|
|
||||||
void *hpet_virt;
|
|
||||||
long hpet_phys;
|
|
||||||
void *pvti_virt;
|
|
||||||
long pvti_phys;
|
|
||||||
};
|
|
||||||
#endif /*POSTK_DEBUG_ARCH_DEP_52*/
|
|
||||||
|
|
||||||
int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp,
|
int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp,
|
||||||
unsigned long *endp);
|
unsigned long *endp);
|
||||||
@@ -573,8 +533,28 @@ struct ihk_perf_event_attr{
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct mcctrl_ioctl_getrusage_desc {
|
struct mcctrl_ioctl_getrusage_desc {
|
||||||
void* rusage;
|
struct ihk_os_rusage *rusage;
|
||||||
size_t size_rusage;
|
size_t size_rusage;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* uti */
|
||||||
|
long mcctrl_switch_ctx(ihk_os_t os, struct uti_switch_ctx_desc __user *desc,
|
||||||
|
struct file *file);
|
||||||
|
long arch_switch_ctx(struct uti_switch_ctx_desc *desc);
|
||||||
|
|
||||||
|
struct host_thread {
|
||||||
|
struct list_head list;
|
||||||
|
struct mcos_handler_info *handler;
|
||||||
|
int pid;
|
||||||
|
int tid;
|
||||||
|
unsigned long usp;
|
||||||
|
unsigned long ltls;
|
||||||
|
unsigned long rtls;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Used to wake-up a Linux thread futex_wait()-ing */
|
||||||
|
struct uti_futex_resp {
|
||||||
|
int done;
|
||||||
|
wait_queue_head_t wq;
|
||||||
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -1110,10 +1110,10 @@ static const struct procfs_entry pid_entry_stuff[] = {
|
|||||||
// PROC_LNK("exe", mckernel_readlink),
|
// PROC_LNK("exe", mckernel_readlink),
|
||||||
// PROC_REG("limits", S_IRUSR|S_IWUSR, NULL),
|
// PROC_REG("limits", S_IRUSR|S_IWUSR, NULL),
|
||||||
PROC_REG("maps", 0444, &mckernel_buff_io),
|
PROC_REG("maps", 0444, &mckernel_buff_io),
|
||||||
PROC_REG("mem", 0400, NULL),
|
PROC_REG("mem", 0600, NULL),
|
||||||
PROC_REG("pagemap", 0444, NULL),
|
PROC_REG("pagemap", 0444, NULL),
|
||||||
// PROC_REG("smaps", S_IRUGO, NULL),
|
// PROC_REG("smaps", S_IRUGO, NULL),
|
||||||
// PROC_REG("stat", 0444, &mckernel_buff_io),
|
PROC_REG("stat", 0444, &mckernel_buff_io),
|
||||||
// PROC_REG("statm", S_IRUGO, NULL),
|
// PROC_REG("statm", S_IRUGO, NULL),
|
||||||
PROC_REG("status", 0444, &mckernel_buff_io),
|
PROC_REG("status", 0444, &mckernel_buff_io),
|
||||||
// PROC_REG("syscall", S_IRUGO, NULL),
|
// PROC_REG("syscall", S_IRUGO, NULL),
|
||||||
@@ -1130,6 +1130,7 @@ static const struct procfs_entry base_entry_stuff[] = {
|
|||||||
// PROC_REG("cpuinfo", S_IRUGO, NULL),
|
// PROC_REG("cpuinfo", S_IRUGO, NULL),
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_42 */
|
#endif /* POSTK_DEBUG_ARCH_DEP_42 */
|
||||||
// PROC_REG("meminfo", S_IRUGO, NULL),
|
// PROC_REG("meminfo", S_IRUGO, NULL),
|
||||||
|
PROC_REG("meminfo", S_IRUGO, &mckernel_buff_io),
|
||||||
// PROC_REG("pagetypeinfo",S_IRUGO, NULL),
|
// PROC_REG("pagetypeinfo",S_IRUGO, NULL),
|
||||||
// PROC_REG("softirq", S_IRUGO, NULL),
|
// PROC_REG("softirq", S_IRUGO, NULL),
|
||||||
PROC_REG("stat", 0444, &mckernel_buff_io),
|
PROC_REG("stat", 0444, &mckernel_buff_io),
|
||||||
|
|||||||
@@ -43,6 +43,8 @@
|
|||||||
#include <linux/semaphore.h>
|
#include <linux/semaphore.h>
|
||||||
#include <linux/spinlock.h>
|
#include <linux/spinlock.h>
|
||||||
#include <linux/mount.h>
|
#include <linux/mount.h>
|
||||||
|
#include <linux/kdev_t.h>
|
||||||
|
#include <linux/hugetlb.h>
|
||||||
#include <asm/uaccess.h>
|
#include <asm/uaccess.h>
|
||||||
#include <asm/delay.h>
|
#include <asm/delay.h>
|
||||||
#include <asm/io.h>
|
#include <asm/io.h>
|
||||||
@@ -178,8 +180,8 @@ int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data *ppd, void *data)
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef POSTK_DEBUG_ARCH_DEP_56 /* Strange how to use inline declaration fix. */
|
struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd,
|
||||||
struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd, struct task_struct *task)
|
struct task_struct *task)
|
||||||
{
|
{
|
||||||
struct mcctrl_per_thread_data *ptd_iter, *ptd = NULL;
|
struct mcctrl_per_thread_data *ptd_iter, *ptd = NULL;
|
||||||
int hash = (((uint64_t)task >> 4) & MCCTRL_PER_THREAD_DATA_HASH_MASK);
|
int hash = (((uint64_t)task >> 4) & MCCTRL_PER_THREAD_DATA_HASH_MASK);
|
||||||
@@ -208,7 +210,6 @@ struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc
|
|||||||
read_unlock_irqrestore(&ppd->per_thread_data_hash_lock[hash], flags);
|
read_unlock_irqrestore(&ppd->per_thread_data_hash_lock[hash], flags);
|
||||||
return ptd;
|
return ptd;
|
||||||
}
|
}
|
||||||
#endif /* !POSTK_DEBUG_ARCH_DEP_56 */
|
|
||||||
|
|
||||||
static int __notify_syscall_requester(ihk_os_t os, struct ikc_scd_packet *packet,
|
static int __notify_syscall_requester(ihk_os_t os, struct ikc_scd_packet *packet,
|
||||||
struct syscall_response *res)
|
struct syscall_response *res)
|
||||||
@@ -237,8 +238,7 @@ static int __notify_syscall_requester(ihk_os_t os, struct ikc_scd_packet *packet
|
|||||||
|
|
||||||
/* Wait until the status goes back to IHK_SCD_REQ_THREAD_SPINNING or
|
/* Wait until the status goes back to IHK_SCD_REQ_THREAD_SPINNING or
|
||||||
IHK_SCD_REQ_THREAD_DESCHEDULED because two wake-up attempts are competing.
|
IHK_SCD_REQ_THREAD_DESCHEDULED because two wake-up attempts are competing.
|
||||||
Note that mcexec_terminate_thread() and remote page fault and
|
Note that mcexec_terminate_thread() and returning EINTR would compete. */
|
||||||
returning EINTR would compete. */
|
|
||||||
if (res->req_thread_status == IHK_SCD_REQ_THREAD_TO_BE_WOKEN) {
|
if (res->req_thread_status == IHK_SCD_REQ_THREAD_TO_BE_WOKEN) {
|
||||||
printk("%s: INFO: someone else is waking up the McKernel thread, "
|
printk("%s: INFO: someone else is waking up the McKernel thread, "
|
||||||
"pid: %d, req status: %lu, syscall nr: %lu\n",
|
"pid: %d, req status: %lu, syscall nr: %lu\n",
|
||||||
@@ -273,7 +273,7 @@ long syscall_backward(struct mcctrl_usrdata *usrdata, int num,
|
|||||||
unsigned long *ret)
|
unsigned long *ret)
|
||||||
{
|
{
|
||||||
struct ikc_scd_packet *packet;
|
struct ikc_scd_packet *packet;
|
||||||
struct ikc_scd_packet *free_packet = NULL;
|
struct ikc_scd_packet *free_packet = NULL;
|
||||||
struct syscall_request *req;
|
struct syscall_request *req;
|
||||||
struct syscall_response *resp;
|
struct syscall_response *resp;
|
||||||
unsigned long syscall_ret;
|
unsigned long syscall_ret;
|
||||||
@@ -282,15 +282,16 @@ long syscall_backward(struct mcctrl_usrdata *usrdata, int num,
|
|||||||
struct mcctrl_per_proc_data *ppd;
|
struct mcctrl_per_proc_data *ppd;
|
||||||
struct mcctrl_per_thread_data *ptd;
|
struct mcctrl_per_thread_data *ptd;
|
||||||
unsigned long phys;
|
unsigned long phys;
|
||||||
struct syscall_request _request[2];
|
struct syscall_request *request = NULL;
|
||||||
struct syscall_request *request;
|
|
||||||
int retry;
|
int retry;
|
||||||
|
|
||||||
if (((unsigned long)_request ^ (unsigned long)(_request + 1)) &
|
request = kmalloc(sizeof(struct syscall_request), GFP_ATOMIC);
|
||||||
~(PAGE_SIZE -1))
|
if (!request) {
|
||||||
request = _request + 1;
|
printk("%s: ERROR: allocating request\n", __func__);
|
||||||
else
|
syscall_ret = -ENOMEM;
|
||||||
request = _request;
|
goto no_ppd;
|
||||||
|
}
|
||||||
|
|
||||||
request->number = num;
|
request->number = num;
|
||||||
request->args[0] = arg1;
|
request->args[0] = arg1;
|
||||||
request->args[1] = arg2;
|
request->args[1] = arg2;
|
||||||
@@ -305,8 +306,9 @@ long syscall_backward(struct mcctrl_usrdata *usrdata, int num,
|
|||||||
|
|
||||||
if (!ppd) {
|
if (!ppd) {
|
||||||
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
|
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
|
||||||
__FUNCTION__, task_tgid_vnr(current));
|
__func__, task_tgid_vnr(current));
|
||||||
return -EINVAL;
|
syscall_ret = -EINVAL;
|
||||||
|
goto no_ppd;
|
||||||
}
|
}
|
||||||
|
|
||||||
ptd = mcctrl_get_per_thread_data(ppd, current);
|
ptd = mcctrl_get_per_thread_data(ppd, current);
|
||||||
@@ -454,11 +456,13 @@ out:
|
|||||||
out_put_ppd:
|
out_put_ppd:
|
||||||
mcctrl_put_per_thread_data(ptd);
|
mcctrl_put_per_thread_data(ptd);
|
||||||
pr_ptd("put", task_pid_vnr(current), ptd);
|
pr_ptd("put", task_pid_vnr(current), ptd);
|
||||||
no_ptd:
|
no_ptd:
|
||||||
dprintk("%s: tid: %d, syscall: %d, syscall_ret: %lx\n",
|
dprintk("%s: tid: %d, syscall: %d, syscall_ret: %lx\n",
|
||||||
__FUNCTION__, task_pid_vnr(current), num, syscall_ret);
|
__FUNCTION__, task_pid_vnr(current), num, syscall_ret);
|
||||||
|
|
||||||
mcctrl_put_per_proc_data(ppd);
|
mcctrl_put_per_proc_data(ppd);
|
||||||
|
no_ppd:
|
||||||
|
kfree(request);
|
||||||
return syscall_ret;
|
return syscall_ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -479,214 +483,43 @@ extern struct host_thread *host_threads;
|
|||||||
extern rwlock_t host_thread_lock;
|
extern rwlock_t host_thread_lock;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr, uint64_t reason)
|
int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr,
|
||||||
|
uint64_t reason, struct mcctrl_per_proc_data *ppd,
|
||||||
|
struct ikc_scd_packet *packet)
|
||||||
{
|
{
|
||||||
struct ikc_scd_packet *packet;
|
|
||||||
struct ikc_scd_packet *free_packet = NULL;
|
|
||||||
struct syscall_request *req;
|
|
||||||
struct syscall_response *resp;
|
|
||||||
int error;
|
int error;
|
||||||
struct wait_queue_head_list_node *wqhln;
|
struct mcctrl_wakeup_desc *desc;
|
||||||
unsigned long irqflags;
|
int do_frees = 1;
|
||||||
struct mcctrl_per_proc_data *ppd;
|
|
||||||
struct mcctrl_per_thread_data *ptd;
|
|
||||||
unsigned long phys;
|
|
||||||
int retry;
|
|
||||||
|
|
||||||
dprintk("%s: tid: %d, fault_addr: %p, reason: %lu\n",
|
dprintk("%s: tid: %d, fault_addr: %p, reason: %lu\n",
|
||||||
__FUNCTION__, task_pid_vnr(current), fault_addr, (unsigned long)reason);
|
__FUNCTION__, task_pid_vnr(current), fault_addr, (unsigned long)reason);
|
||||||
|
|
||||||
/* Look up per-process structure */
|
|
||||||
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
|
|
||||||
|
|
||||||
if (!ppd) {
|
|
||||||
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
|
|
||||||
__FUNCTION__, task_tgid_vnr(current));
|
|
||||||
return -EINVAL;
|
|
||||||
}
|
|
||||||
|
|
||||||
ptd = mcctrl_get_per_thread_data(ppd, current);
|
|
||||||
if (!ptd) {
|
|
||||||
printk("%s: ERROR: mcctrl_get_per_thread_data failed\n", __FUNCTION__);
|
|
||||||
error = -ENOENT;
|
|
||||||
goto no_ptd;
|
|
||||||
}
|
|
||||||
pr_ptd("get", task_pid_vnr(current), ptd);
|
|
||||||
packet = (struct ikc_scd_packet *)ptd->data;
|
|
||||||
if (!packet) {
|
|
||||||
printk("%s: no packet registered for TID %d\n",
|
|
||||||
__FUNCTION__, task_pid_vnr(current));
|
|
||||||
error = -ENOENT;
|
|
||||||
goto out_put_ppd;
|
|
||||||
}
|
|
||||||
|
|
||||||
req = &packet->req;
|
|
||||||
|
|
||||||
/* Map response structure */
|
|
||||||
phys = ihk_device_map_memory(ihk_os_to_dev(usrdata->os),
|
|
||||||
packet->resp_pa, sizeof(*resp));
|
|
||||||
resp = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os),
|
|
||||||
phys, sizeof(*resp), NULL, 0);
|
|
||||||
if (!resp) {
|
|
||||||
printk("%s: ERROR: invalid response structure address\n",
|
|
||||||
__FUNCTION__);
|
|
||||||
error = -EINVAL;
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
retry_alloc:
|
|
||||||
wqhln = kmalloc(sizeof(*wqhln), GFP_ATOMIC);
|
|
||||||
if (!wqhln) {
|
|
||||||
printk("WARNING: coudln't alloc wait queue head, retrying..\n");
|
|
||||||
goto retry_alloc;
|
|
||||||
}
|
|
||||||
memset(wqhln, 0, sizeof(struct wait_queue_head_list_node));
|
|
||||||
|
|
||||||
/* Prepare per-thread wait queue head */
|
|
||||||
wqhln->task = current;
|
|
||||||
/* Save the TID explicitly, because mcexec_syscall(), where the request
|
|
||||||
* will be matched, is in IRQ context and can't call task_pid_vnr() */
|
|
||||||
wqhln->rtid = task_pid_vnr(current);
|
|
||||||
wqhln->req = 0;
|
|
||||||
init_waitqueue_head(&wqhln->wq_syscall);
|
|
||||||
|
|
||||||
irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
|
|
||||||
/* Add to exact list */
|
|
||||||
list_add_tail(&wqhln->list, &ppd->wq_list_exact);
|
|
||||||
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);
|
|
||||||
|
|
||||||
/* Request page fault */
|
/* Request page fault */
|
||||||
resp->ret = -EFAULT;
|
packet->msg = SCD_MSG_REMOTE_PAGE_FAULT;
|
||||||
resp->fault_address = (unsigned long)fault_addr;
|
packet->fault_address = (unsigned long)fault_addr;
|
||||||
resp->fault_reason = reason;
|
packet->fault_reason = reason;
|
||||||
resp->stid = task_pid_vnr(current);
|
|
||||||
|
|
||||||
#define STATUS_PAGER_COMPLETED 1
|
/* we need to alloc desc ourselves because GFP_ATOMIC */
|
||||||
#define STATUS_PAGE_FAULT 3
|
retry_alloc:
|
||||||
req->valid = 0;
|
desc = kmalloc(sizeof(*desc), GFP_ATOMIC);
|
||||||
|
if (!desc) {
|
||||||
if (__notify_syscall_requester(usrdata->os, packet, resp) < 0) {
|
pr_warn("WARNING: coudln't alloc remote page fault wait desc, retrying..\n");
|
||||||
printk("%s: WARNING: failed to notify PID %d\n",
|
goto retry_alloc;
|
||||||
__FUNCTION__, packet->pid);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
mb();
|
/* packet->target_cpu was set in rus_vm_fault if a thread was found */
|
||||||
resp->status = STATUS_PAGE_FAULT;
|
error = mcctrl_ikc_send_wait(usrdata->os, packet->target_cpu, packet,
|
||||||
|
0, desc, &do_frees, 0);
|
||||||
retry = 0;
|
if (do_frees)
|
||||||
for (;;) {
|
kfree(desc);
|
||||||
dprintk("%s: tid: %d, fault_addr: %p SLEEPING\n",
|
if (error < 0) {
|
||||||
__FUNCTION__, task_pid_vnr(current), fault_addr);
|
pr_warn("%s: WARNING: failed to request remote page fault PID %d: %d\n",
|
||||||
/* wait for response */
|
__func__, packet->pid, error);
|
||||||
error = wait_event_interruptible(wqhln->wq_syscall, wqhln->req);
|
|
||||||
|
|
||||||
/* Delay signal handling */
|
|
||||||
if (error == -ERESTARTSYS) {
|
|
||||||
printk("%s: INFO: interrupted by signal\n", __FUNCTION__);
|
|
||||||
retry++;
|
|
||||||
if (retry < 5) { /* mcexec is alive */
|
|
||||||
printk("%s: INFO: retry=%d\n", __FUNCTION__, retry);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Remove per-thread wait queue head */
|
|
||||||
irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
|
|
||||||
list_del(&wqhln->list);
|
|
||||||
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);
|
|
||||||
|
|
||||||
dprintk("%s: tid: %d, fault_addr: %p WOKEN UP\n",
|
|
||||||
__FUNCTION__, task_pid_vnr(current), fault_addr);
|
|
||||||
|
|
||||||
if (retry >= 5) {
|
|
||||||
kfree(wqhln);
|
|
||||||
kprintf("%s: INFO: mcexec is gone or retry count exceeded,pid=%d,retry=%d\n", __FUNCTION__, task_tgid_vnr(current), retry);
|
|
||||||
error = -EINVAL;
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (error) {
|
|
||||||
kfree(wqhln);
|
|
||||||
printk("remote_page_fault:interrupted. %d\n", error);
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
/* Update packet reference */
|
|
||||||
packet = wqhln->packet;
|
|
||||||
free_packet = packet;
|
|
||||||
req = &packet->req;
|
|
||||||
{
|
|
||||||
unsigned long phys2;
|
|
||||||
struct syscall_response *resp2;
|
|
||||||
phys2 = ihk_device_map_memory(ihk_os_to_dev(usrdata->os),
|
|
||||||
packet->resp_pa, sizeof(*resp));
|
|
||||||
resp2 = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os),
|
|
||||||
phys2, sizeof(*resp), NULL, 0);
|
|
||||||
|
|
||||||
if (resp != resp2) {
|
|
||||||
resp = resp2;
|
|
||||||
phys = phys2;
|
|
||||||
printk("%s: updated new remote PA for resp\n", __FUNCTION__);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!req->valid) {
|
|
||||||
printk("remote_page_fault:not valid\n");
|
|
||||||
}
|
|
||||||
req->valid = 0;
|
|
||||||
|
|
||||||
/* check result */
|
|
||||||
if (req->number != __NR_mmap) {
|
|
||||||
printk("remote_page_fault:unexpected response. %lx %lx\n",
|
|
||||||
req->number, req->args[0]);
|
|
||||||
error = -EIO;
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
#define PAGER_REQ_RESUME 0x0101
|
|
||||||
else if (req->args[0] != PAGER_REQ_RESUME) {
|
|
||||||
resp->ret = pager_call(usrdata->os, (void *)req);
|
|
||||||
|
|
||||||
if (__notify_syscall_requester(usrdata->os, packet, resp) < 0) {
|
|
||||||
printk("%s: WARNING: failed to notify PID %d\n",
|
|
||||||
__FUNCTION__, packet->pid);
|
|
||||||
}
|
|
||||||
|
|
||||||
mb();
|
|
||||||
resp->status = STATUS_PAGER_COMPLETED;
|
|
||||||
break;
|
|
||||||
//continue;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
error = req->args[1];
|
|
||||||
if (error) {
|
|
||||||
printk("remote_page_fault:response %d\n", error);
|
|
||||||
kfree(wqhln);
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
kfree(wqhln);
|
|
||||||
error = 0;
|
|
||||||
out:
|
|
||||||
/* Release remote page-fault response packet */
|
|
||||||
if (free_packet) {
|
|
||||||
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)free_packet);
|
|
||||||
}
|
|
||||||
|
|
||||||
ihk_device_unmap_virtual(ihk_os_to_dev(usrdata->os), resp, sizeof(*resp));
|
|
||||||
ihk_device_unmap_memory(ihk_os_to_dev(usrdata->os), phys, sizeof(*resp));
|
|
||||||
|
|
||||||
out_put_ppd:
|
|
||||||
mcctrl_put_per_thread_data(ptd);
|
|
||||||
pr_ptd("put", task_pid_vnr(current), ptd);
|
|
||||||
no_ptd:
|
|
||||||
dprintk("%s: tid: %d, fault_addr: %p, reason: %lu, error: %d\n",
|
dprintk("%s: tid: %d, fault_addr: %p, reason: %lu, error: %d\n",
|
||||||
__FUNCTION__, task_pid_vnr(current), fault_addr, (unsigned long)reason, error);
|
__func__, task_pid_vnr(current), fault_addr,
|
||||||
|
(unsigned long)reason, error);
|
||||||
mcctrl_put_per_proc_data(ppd);
|
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -726,70 +559,70 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
|||||||
#endif
|
#endif
|
||||||
struct mcctrl_per_proc_data *ppd;
|
struct mcctrl_per_proc_data *ppd;
|
||||||
struct mcctrl_per_thread_data *ptd;
|
struct mcctrl_per_thread_data *ptd;
|
||||||
struct ikc_scd_packet *packet;
|
struct task_struct *task = current;
|
||||||
|
struct ikc_scd_packet packet = { };
|
||||||
|
unsigned long rsysnum = 0;
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
|
|
||||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
|
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
|
||||||
dprintk("mcctrl:page fault:flags %#x pgoff %#lx va %#lx page %p\n",
|
unsigned long addr = vmf->address;
|
||||||
vmf->flags, vmf->pgoff, vmf->address, vmf->page);
|
|
||||||
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
||||||
dprintk("mcctrl:page fault:flags %#x pgoff %#lx va %p page %p\n",
|
void __user *addr = vmf->virtual_address;
|
||||||
vmf->flags, vmf->pgoff, vmf->virtual_address, vmf->page);
|
|
||||||
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
||||||
|
|
||||||
/* Look up per-process structure */
|
/* Look up per-process structure */
|
||||||
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
|
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(task));
|
||||||
if (!ppd) {
|
if (!ppd) {
|
||||||
kprintf("%s: INFO: no per-process structure for pid %d (tid %d), try to use pid %d\n",
|
pr_err("%s: INFO: no per-process structure for "
|
||||||
__FUNCTION__, task_tgid_vnr(current), task_pid_vnr(current), vma->vm_mm->owner->pid);
|
"pid %d (tid %d), trying to use pid %d\n",
|
||||||
ppd = mcctrl_get_per_proc_data(usrdata, vma->vm_mm->owner->pid);
|
__func__,
|
||||||
|
task_tgid_vnr(task), task_pid_vnr(task),
|
||||||
|
vma->vm_mm->owner->pid);
|
||||||
|
task = vma->vm_mm->owner;
|
||||||
|
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(task));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!ppd) {
|
if (!ppd) {
|
||||||
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
|
pr_err("%s: ERROR: no per-process structure for PID %d??\n",
|
||||||
__FUNCTION__, task_tgid_vnr(current));
|
__func__, task_tgid_vnr(task));
|
||||||
ret = VM_FAULT_SIGBUS;
|
ret = VM_FAULT_SIGBUS;
|
||||||
goto no_ppd;
|
goto no_ppd;
|
||||||
}
|
}
|
||||||
|
packet.fault_tid = ppd->pid;
|
||||||
|
|
||||||
ptd = mcctrl_get_per_thread_data(ppd, current);
|
ptd = mcctrl_get_per_thread_data(ppd, task);
|
||||||
if (!ptd) {
|
if (ptd) {
|
||||||
printk("%s: ERROR: mcctrl_get_per_thread_data failed\n", __FUNCTION__);
|
struct ikc_scd_packet *ptd_packet;
|
||||||
ret = VM_FAULT_SIGBUS;
|
|
||||||
goto no_ptd;
|
pr_ptd("get", task_pid_vnr(task), ptd);
|
||||||
|
ptd_packet = (struct ikc_scd_packet *)ptd->data;
|
||||||
|
if (ptd_packet) {
|
||||||
|
packet.target_cpu = ptd_packet->ref;
|
||||||
|
packet.fault_tid = ptd_packet->req.rtid;
|
||||||
|
rsysnum = ptd_packet->req.number;
|
||||||
|
}
|
||||||
|
mcctrl_put_per_thread_data(ptd);
|
||||||
|
pr_ptd("put", task_pid_vnr(task), ptd);
|
||||||
}
|
}
|
||||||
pr_ptd("get", task_pid_vnr(current), ptd);
|
|
||||||
packet = (struct ikc_scd_packet *)ptd->data;
|
/* Don't even bother looking up NULL */
|
||||||
if (!packet) {
|
if (!addr) {
|
||||||
|
pr_warn("%s: WARNING: attempted NULL pointer access\n",
|
||||||
|
__func__);
|
||||||
ret = VM_FAULT_SIGBUS;
|
ret = VM_FAULT_SIGBUS;
|
||||||
printk("%s: no packet registered for TID %d\n",
|
|
||||||
__FUNCTION__, task_pid_vnr(current));
|
|
||||||
goto put_and_out;
|
goto put_and_out;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (try = 1; ; ++try) {
|
for (try = 1; ; ++try) {
|
||||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
|
|
||||||
error = translate_rva_to_rpa(usrdata->os, ppd->rpgtable,
|
error = translate_rva_to_rpa(usrdata->os, ppd->rpgtable,
|
||||||
vmf->address, &rpa, &pgsize);
|
(unsigned long)addr, &rpa, &pgsize);
|
||||||
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
|
||||||
error = translate_rva_to_rpa(usrdata->os, ppd->rpgtable,
|
|
||||||
(unsigned long)vmf->virtual_address,
|
|
||||||
&rpa, &pgsize);
|
|
||||||
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
|
||||||
#define NTRIES 2
|
#define NTRIES 2
|
||||||
if (!error || (try >= NTRIES)) {
|
if (!error || (try >= NTRIES)) {
|
||||||
if (error) {
|
if (error) {
|
||||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
|
pr_err("%s: error translating 0x%#lx "
|
||||||
printk("%s: error translating 0x%#lx "
|
"(req: TID: %u, syscall: %lu)\n",
|
||||||
"(req: TID: %u, syscall: %lu)\n",
|
__func__,
|
||||||
__FUNCTION__, vmf->address,
|
(unsigned long)addr,
|
||||||
packet->req.rtid, packet->req.number);
|
packet.fault_tid, rsysnum);
|
||||||
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
|
||||||
printk("%s: error translating 0x%p "
|
|
||||||
"(req: TID: %u, syscall: %lu)\n",
|
|
||||||
__FUNCTION__, vmf->virtual_address,
|
|
||||||
packet->req.rtid, packet->req.number);
|
|
||||||
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
@@ -800,23 +633,14 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
|||||||
#define PF_WRITE 0x02
|
#define PF_WRITE 0x02
|
||||||
reason |= PF_WRITE;
|
reason |= PF_WRITE;
|
||||||
}
|
}
|
||||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
|
error = remote_page_fault(usrdata, (void *)addr,
|
||||||
error = remote_page_fault(usrdata, (void *)vmf->address, reason);
|
reason, ppd, &packet);
|
||||||
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
|
||||||
error = remote_page_fault(usrdata, vmf->virtual_address, reason);
|
|
||||||
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
|
||||||
if (error) {
|
if (error) {
|
||||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
|
pr_err("%s: error forwarding PF for 0x%#lx "
|
||||||
printk("%s: error forwarding PF for 0x%#lx "
|
"(req: TID: %d, syscall: %lu)\n",
|
||||||
"(req: TID: %d, syscall: %lu)\n",
|
__func__,
|
||||||
__FUNCTION__, vmf->address,
|
(unsigned long)addr,
|
||||||
packet->req.rtid, packet->req.number);
|
packet.fault_tid, rsysnum);
|
||||||
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
|
||||||
printk("%s: error forwarding PF for 0x%p "
|
|
||||||
"(req: TID: %d, syscall: %lu)\n",
|
|
||||||
__FUNCTION__, vmf->virtual_address,
|
|
||||||
packet->req.rtid, packet->req.number);
|
|
||||||
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -825,11 +649,7 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
|||||||
goto put_and_out;
|
goto put_and_out;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
|
rva = (unsigned long)addr & ~(pgsize - 1);
|
||||||
rva = vmf->address & ~(pgsize - 1);
|
|
||||||
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
|
||||||
rva = (unsigned long)vmf->virtual_address & ~(pgsize - 1);
|
|
||||||
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
|
||||||
rpa = rpa & ~(pgsize - 1);
|
rpa = rpa & ~(pgsize - 1);
|
||||||
|
|
||||||
phys = ihk_device_map_memory(dev, rpa, pgsize);
|
phys = ihk_device_map_memory(dev, rpa, pgsize);
|
||||||
@@ -849,21 +669,13 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
|||||||
|
|
||||||
error = vm_insert_page(vma, rva+(pix*PAGE_SIZE), page);
|
error = vm_insert_page(vma, rva+(pix*PAGE_SIZE), page);
|
||||||
if (error) {
|
if (error) {
|
||||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
|
pr_err("%s: error inserting mapping for 0x%#lx "
|
||||||
printk("%s: error inserting mapping for 0x%#lx "
|
"(req: TID: %d, syscall: %lu) error: %d,"
|
||||||
"(req: TID: %d, syscall: %lu) error: %d, "
|
" vm_start: 0x%lx, vm_end: 0x%lx\n",
|
||||||
"vm_start: 0x%lx, vm_end: 0x%lx\n",
|
__func__,
|
||||||
__FUNCTION__, vmf->address,
|
(unsigned long)addr, packet.fault_tid,
|
||||||
packet->req.rtid, packet->req.number, error,
|
rsysnum, error,
|
||||||
vma->vm_start, vma->vm_end);
|
vma->vm_start, vma->vm_end);
|
||||||
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
|
||||||
printk("%s: error inserting mapping for 0x%p "
|
|
||||||
"(req: TID: %d, syscall: %lu) error: %d, "
|
|
||||||
"vm_start: 0x%lx, vm_end: 0x%lx\n",
|
|
||||||
__FUNCTION__, vmf->virtual_address,
|
|
||||||
packet->req.rtid, packet->req.number, error,
|
|
||||||
vma->vm_start, vma->vm_end);
|
|
||||||
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -875,16 +687,13 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
|||||||
pfn+pix);
|
pfn+pix);
|
||||||
#endif
|
#endif
|
||||||
if (error) {
|
if (error) {
|
||||||
#if 1 /* POSTK_DEBUG_TEMP_FIX_11 */ /* rus_vm_fault() multi-thread fix */
|
pr_err("%s: vm_insert_pfn returned %d\n",
|
||||||
printk("%s: vm_insert_pfn returned %d\n", __FUNCTION__, error);
|
__func__, error);
|
||||||
if (error == -EBUSY) {
|
if (error == -EBUSY) {
|
||||||
error = 0;
|
error = 0;
|
||||||
} else {
|
} else {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#else /* POSTK_DEBUG_TEMP_FIX_11 */
|
|
||||||
break;
|
|
||||||
#endif /* POSTK_DEBUG_TEMP_FIX_11 */
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
@@ -892,17 +701,11 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
|||||||
#endif
|
#endif
|
||||||
ihk_device_unmap_memory(dev, phys, pgsize);
|
ihk_device_unmap_memory(dev, phys, pgsize);
|
||||||
if (error) {
|
if (error) {
|
||||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
|
pr_err("%s: remote PF failed for 0x%#lx, pgoff: %lu"
|
||||||
printk("%s: remote PF failed for 0x%#lx, pgoff: %lu "
|
" (req: TID: %d, syscall: %lu)\n",
|
||||||
"(req: TID: %d, syscall: %lu)\n",
|
__func__,
|
||||||
__FUNCTION__, vmf->address, vmf->pgoff,
|
(unsigned long)addr, vmf->pgoff,
|
||||||
packet->req.rtid, packet->req.number);
|
packet.fault_tid, rsysnum);
|
||||||
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
|
||||||
printk("%s: remote PF failed for 0x%p, pgoff: %lu "
|
|
||||||
"(req: TID: %d, syscall: %lu)\n",
|
|
||||||
__FUNCTION__, vmf->virtual_address, vmf->pgoff,
|
|
||||||
packet->req.rtid, packet->req.number);
|
|
||||||
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
|
||||||
ret = VM_FAULT_SIGBUS;
|
ret = VM_FAULT_SIGBUS;
|
||||||
goto put_and_out;
|
goto put_and_out;
|
||||||
}
|
}
|
||||||
@@ -910,9 +713,6 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
|||||||
ret = VM_FAULT_NOPAGE;
|
ret = VM_FAULT_NOPAGE;
|
||||||
|
|
||||||
put_and_out:
|
put_and_out:
|
||||||
mcctrl_put_per_thread_data(ptd);
|
|
||||||
pr_ptd("put", task_pid_vnr(current), ptd);
|
|
||||||
no_ptd:
|
|
||||||
mcctrl_put_per_proc_data(ppd);
|
mcctrl_put_per_proc_data(ppd);
|
||||||
no_ppd:
|
no_ppd:
|
||||||
return ret;
|
return ret;
|
||||||
@@ -1075,6 +875,7 @@ struct pager_create_result {
|
|||||||
int maxprot;
|
int maxprot;
|
||||||
uint32_t flags;
|
uint32_t flags;
|
||||||
size_t size;
|
size_t size;
|
||||||
|
int pgshift;
|
||||||
char path[PATH_MAX];
|
char path[PATH_MAX];
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -1136,6 +937,7 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
|
|||||||
struct kstat st;
|
struct kstat st;
|
||||||
int mf_flags = 0;
|
int mf_flags = 0;
|
||||||
unsigned long irqflags;
|
unsigned long irqflags;
|
||||||
|
int pgshift = 0;
|
||||||
|
|
||||||
dprintk("pager_req_create(%d,%lx)\n", fd, (long)result_pa);
|
dprintk("pager_req_create(%d,%lx)\n", fd, (long)result_pa);
|
||||||
|
|
||||||
@@ -1144,8 +946,16 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
|
|||||||
printk("pager_req_create(%d,%lx):vfs_stat failed. %d\n", fd, (long)result_pa, error);
|
printk("pager_req_create(%d,%lx):vfs_stat failed. %d\n", fd, (long)result_pa, error);
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
if (S_ISCHR(st.mode) && (MAJOR(st.rdev) == 1)) {
|
if (S_ISCHR(st.mode) && (MAJOR(st.rdev) == 1) &&
|
||||||
/* treat memory devices as regular files */
|
(MINOR(st.rdev) == 1 || // /dev/mem
|
||||||
|
MINOR(st.rdev) == 5)) { // /dev/zero
|
||||||
|
/* treat memory devices and zero devices as regular files */
|
||||||
|
}
|
||||||
|
else if (S_ISCHR(st.mode) && (MAJOR(st.rdev) == 1)) {
|
||||||
|
error = -ENODEV;
|
||||||
|
dprintk("%s(%d,%lx):unmappable device %x\n",
|
||||||
|
__func__, fd, (long)result_pa, st.mode);
|
||||||
|
goto out;
|
||||||
}
|
}
|
||||||
else if (!S_ISREG(st.mode)) {
|
else if (!S_ISREG(st.mode)) {
|
||||||
error = -ESRCH;
|
error = -ESRCH;
|
||||||
@@ -1167,6 +977,10 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
|
|||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!strcmp(inode->i_sb->s_type->name, "tmpfs")) {
|
||||||
|
mf_flags = MF_IS_REMOVABLE;
|
||||||
|
}
|
||||||
|
|
||||||
if (!strcmp(inode->i_sb->s_type->name, "proc")) {
|
if (!strcmp(inode->i_sb->s_type->name, "proc")) {
|
||||||
error = -ESRCH;
|
error = -ESRCH;
|
||||||
goto out;
|
goto out;
|
||||||
@@ -1188,13 +1002,14 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (inode->i_op == mcctrl_hugetlbfs_inode_operations) {
|
if (inode->i_op == mcctrl_hugetlbfs_inode_operations) {
|
||||||
|
struct hstate *h = hstate_file(file);
|
||||||
|
|
||||||
|
pgshift = PAGE_SHIFT + huge_page_order(h);
|
||||||
mf_flags = MF_HUGETLBFS;
|
mf_flags = MF_HUGETLBFS;
|
||||||
/* pager is used as handle id on mckernel side, use inode */
|
/* pager is used as handle id on mckernel side, use inode */
|
||||||
pager = (void *)st.ino;
|
pager = (void *)st.ino;
|
||||||
/* retrofit blksize in resp as well through st.size field;
|
/* file size is not used */
|
||||||
* the actual file size is not used
|
st.size = 0;
|
||||||
*/
|
|
||||||
st.size = st.blksize;
|
|
||||||
goto out_reply;
|
goto out_reply;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1281,6 +1096,7 @@ out_reply:
|
|||||||
resp->maxprot = maxprot;
|
resp->maxprot = maxprot;
|
||||||
resp->flags = mf_flags;
|
resp->flags = mf_flags;
|
||||||
resp->size = st.size;
|
resp->size = st.size;
|
||||||
|
resp->pgshift = pgshift;
|
||||||
|
|
||||||
error = pager_get_path(file, resp->path);
|
error = pager_get_path(file, resp->path);
|
||||||
|
|
||||||
@@ -1355,8 +1171,9 @@ static int pager_req_read(ihk_os_t os, uintptr_t handle, off_t off, size_t size,
|
|||||||
uintptr_t phys = -1;
|
uintptr_t phys = -1;
|
||||||
ihk_device_t dev = ihk_os_to_dev(os);
|
ihk_device_t dev = ihk_os_to_dev(os);
|
||||||
void *buf = NULL;
|
void *buf = NULL;
|
||||||
loff_t pos;
|
loff_t pos, fsize;
|
||||||
unsigned long flags;
|
unsigned long flags;
|
||||||
|
unsigned int major, minor;
|
||||||
|
|
||||||
dprintk("pager_req_read(%lx,%lx,%lx,%lx)\n", handle, off, size, rpa);
|
dprintk("pager_req_read(%lx,%lx,%lx,%lx)\n", handle, off, size, rpa);
|
||||||
|
|
||||||
@@ -1378,6 +1195,21 @@ static int pager_req_read(ihk_os_t os, uintptr_t handle, off_t off, size_t size,
|
|||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
major = MAJOR(file->f_mapping->host->i_rdev);
|
||||||
|
minor = MINOR(file->f_mapping->host->i_rdev);
|
||||||
|
if ((major == 1 && minor == 1) || // /dev/mem
|
||||||
|
(major == 1 && minor == 5)) { // /dev/zero
|
||||||
|
/* Nothing to check */
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
/* Check if the target page fits in the file */
|
||||||
|
fsize = i_size_read(file->f_mapping->host);
|
||||||
|
if (off > fsize) {
|
||||||
|
ss = 0;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
phys = ihk_device_map_memory(dev, rpa, size);
|
phys = ihk_device_map_memory(dev, rpa, size);
|
||||||
buf = ihk_device_map_virtual(dev, phys, size, NULL, 0);
|
buf = ihk_device_map_virtual(dev, phys, size, NULL, 0);
|
||||||
if (!buf) {
|
if (!buf) {
|
||||||
@@ -1712,16 +1544,9 @@ retry:
|
|||||||
pfn |= PFN_VALID | PFN_PRESENT;
|
pfn |= PFN_VALID | PFN_PRESENT;
|
||||||
|
|
||||||
/* Check if mapping is write-combined */
|
/* Check if mapping is write-combined */
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_12
|
|
||||||
if (pte_is_write_combined(*pte)) {
|
if (pte_is_write_combined(*pte)) {
|
||||||
pfn |= PFN_WRITE_COMBINED;
|
pfn |= PFN_WRITE_COMBINED;
|
||||||
}
|
}
|
||||||
#else /* POSTK_DEBUG_ARCH_DEP_12 */
|
|
||||||
if ((pte_flags(*pte) & _PAGE_PWT) &&
|
|
||||||
!(pte_flags(*pte) & _PAGE_PCD)) {
|
|
||||||
pfn |= _PAGE_PWT;
|
|
||||||
}
|
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_12 */
|
|
||||||
}
|
}
|
||||||
pte_unmap(pte);
|
pte_unmap(pte);
|
||||||
}
|
}
|
||||||
@@ -2052,6 +1877,8 @@ int mcctrl_clear_pte_range(uintptr_t start, uintptr_t len)
|
|||||||
}
|
}
|
||||||
if (addr < end) {
|
if (addr < end) {
|
||||||
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0)
|
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0)
|
||||||
|
/* Revert permission */
|
||||||
|
vma->vm_flags |= VM_READ | VM_WRITE | VM_EXEC;
|
||||||
error = zap_vma_ptes(vma, addr, end-addr);
|
error = zap_vma_ptes(vma, addr, end-addr);
|
||||||
if (error) {
|
if (error) {
|
||||||
mcctrl_zap_page_range(vma, addr, end-addr,
|
mcctrl_zap_page_range(vma, addr, end-addr,
|
||||||
@@ -2069,6 +1896,8 @@ int mcctrl_clear_pte_range(uintptr_t start, uintptr_t len)
|
|||||||
NULL);
|
NULL);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
/* Revert permission */
|
||||||
|
vma->vm_flags |= VM_READ | VM_WRITE | VM_EXEC;
|
||||||
zap_vma_ptes(vma, addr, end-addr);
|
zap_vma_ptes(vma, addr, end-addr);
|
||||||
}
|
}
|
||||||
#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0) */
|
#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0) */
|
||||||
@@ -2124,7 +1953,10 @@ int release_user_space(uintptr_t start, uintptr_t len)
|
|||||||
* \param chunks The number of chunks which make a core file image in the whole.
|
* \param chunks The number of chunks which make a core file image in the whole.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static int writecore(ihk_os_t os, unsigned long rcoretable, int chunks) {
|
static int writecore(ihk_os_t os, unsigned long rcoretable, int chunks,
|
||||||
|
unsigned long cmdline_rphys, unsigned long cmdline_len)
|
||||||
|
{
|
||||||
|
char *fn = NULL;
|
||||||
struct file *file;
|
struct file *file;
|
||||||
struct coretable *coretable;
|
struct coretable *coretable;
|
||||||
int i, tablesize, error = 0;
|
int i, tablesize, error = 0;
|
||||||
@@ -2133,22 +1965,43 @@ static int writecore(ihk_os_t os, unsigned long rcoretable, int chunks) {
|
|||||||
unsigned long phys, tablephys, rphys;
|
unsigned long phys, tablephys, rphys;
|
||||||
ihk_device_t dev = ihk_os_to_dev(os);
|
ihk_device_t dev = ihk_os_to_dev(os);
|
||||||
char *pt;
|
char *pt;
|
||||||
|
unsigned long cmdline_phys;
|
||||||
|
char *cmdline;
|
||||||
|
|
||||||
dprintk("coredump called as a pseudo syscall\n");
|
dprintk("coredump called as a pseudo syscall\n");
|
||||||
|
|
||||||
|
fn = kmalloc(PATH_MAX, GFP_ATOMIC);
|
||||||
|
if (!fn) {
|
||||||
|
dprintk("%s: ERROR: allocating file name\n", __func__);
|
||||||
|
error = -ENOMEM;
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
|
||||||
if (chunks <= 0) {
|
if (chunks <= 0) {
|
||||||
dprintk("no core data found!(%d)\n", chunks);
|
dprintk("no core data found!(%d)\n", chunks);
|
||||||
error = -EINVAL;
|
error = -EINVAL;
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cmdline_phys = ihk_device_map_memory(dev, cmdline_rphys, cmdline_len);
|
||||||
|
cmdline = ihk_device_map_virtual(dev, cmdline_phys, cmdline_len, NULL,
|
||||||
|
0);
|
||||||
|
sprintf(fn, "mccore-%s.%d",
|
||||||
|
strrchr(cmdline, '/') ?
|
||||||
|
strrchr(cmdline, '/') + 1 : cmdline,
|
||||||
|
task_tgid_vnr(current));
|
||||||
|
pr_info("%s: fn=%s\n", __func__, fn);
|
||||||
|
|
||||||
|
ihk_device_unmap_virtual(dev, cmdline, cmdline_len);
|
||||||
|
ihk_device_unmap_memory(dev, cmdline_phys, cmdline_len);
|
||||||
|
|
||||||
/* Every Linux documentation insists we should not
|
/* Every Linux documentation insists we should not
|
||||||
* open a file in the kernel module, but our karma
|
* open a file in the kernel module, but our karma
|
||||||
* leads us here. Precisely, Here we emulate the core
|
* leads us here. Precisely, Here we emulate the core
|
||||||
* dump routine of the Linux kernel in linux/fs/exec.c.
|
* dump routine of the Linux kernel in linux/fs/exec.c.
|
||||||
* So we have a legitimate reason to do this.
|
* So we have a legitimate reason to do this.
|
||||||
*/
|
*/
|
||||||
file = filp_open("core", O_CREAT | O_RDWR | O_LARGEFILE | O_TRUNC, 0600);
|
file = filp_open(fn, O_CREAT | O_RDWR | O_LARGEFILE | O_TRUNC, 0600);
|
||||||
if (IS_ERR(file) || !file->f_op) {
|
if (IS_ERR(file) || !file->f_op) {
|
||||||
dprintk("cannot open core file\n");
|
dprintk("cannot open core file\n");
|
||||||
error = PTR_ERR(file);
|
error = PTR_ERR(file);
|
||||||
@@ -2218,6 +2071,7 @@ fail:
|
|||||||
/* make sure we do not travel to user land */
|
/* make sure we do not travel to user land */
|
||||||
error = -EINVAL;
|
error = -EINVAL;
|
||||||
}
|
}
|
||||||
|
kfree(fn);
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2273,7 +2127,8 @@ int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet)
|
|||||||
}
|
}
|
||||||
|
|
||||||
case __NR_coredump:
|
case __NR_coredump:
|
||||||
ret = writecore(os, sc->args[1], sc->args[0]);
|
ret = writecore(os, sc->args[1], sc->args[0], sc->args[2],
|
||||||
|
sc->args[3]);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case __NR_sched_setparam: {
|
case __NR_sched_setparam: {
|
||||||
|
|||||||
@@ -12,6 +12,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include <linux/kernel.h>
|
#include <linux/kernel.h>
|
||||||
|
#include <linux/hashtable.h>
|
||||||
#include <linux/slab.h>
|
#include <linux/slab.h>
|
||||||
#include <linux/uaccess.h>
|
#include <linux/uaccess.h>
|
||||||
#include <linux/version.h>
|
#include <linux/version.h>
|
||||||
@@ -23,6 +24,14 @@
|
|||||||
#define wprintk(...) do { if (1) printk(KERN_WARNING __VA_ARGS__); } while (0)
|
#define wprintk(...) do { if (1) printk(KERN_WARNING __VA_ARGS__); } while (0)
|
||||||
#define eprintk(...) do { if (1) printk(KERN_ERR __VA_ARGS__); } while (0)
|
#define eprintk(...) do { if (1) printk(KERN_ERR __VA_ARGS__); } while (0)
|
||||||
|
|
||||||
|
struct physical_core_id {
|
||||||
|
int linux_core_id;
|
||||||
|
int mckernel_core_id;
|
||||||
|
struct hlist_node next;
|
||||||
|
};
|
||||||
|
|
||||||
|
DEFINE_HASHTABLE(physical_core_id_map, 10);
|
||||||
|
|
||||||
static ssize_t
|
static ssize_t
|
||||||
show_int(struct sysfsm_ops *ops, void *instance, void *buf, size_t size)
|
show_int(struct sysfsm_ops *ops, void *instance, void *buf, size_t size)
|
||||||
{
|
{
|
||||||
@@ -157,13 +166,8 @@ static void free_node_topology(struct mcctrl_usrdata *udp)
|
|||||||
return;
|
return;
|
||||||
} /* free_node_topology() */
|
} /* free_node_topology() */
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
|
||||||
static void free_cpu_topology_one(struct mcctrl_usrdata *udp,
|
static void free_cpu_topology_one(struct mcctrl_usrdata *udp,
|
||||||
struct mcctrl_cpu_topology *cpu)
|
struct mcctrl_cpu_topology *cpu)
|
||||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
static void free_cpu_topology_one(struct mcctrl_usrdata *udp,
|
|
||||||
struct cpu_topology *cpu)
|
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
{
|
{
|
||||||
struct cache_topology *cache;
|
struct cache_topology *cache;
|
||||||
struct cache_topology *next;
|
struct cache_topology *next;
|
||||||
@@ -179,13 +183,8 @@ static void free_cpu_topology_one(struct mcctrl_usrdata *udp,
|
|||||||
|
|
||||||
static void free_cpu_topology(struct mcctrl_usrdata *udp)
|
static void free_cpu_topology(struct mcctrl_usrdata *udp)
|
||||||
{
|
{
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
|
||||||
struct mcctrl_cpu_topology *cpu;
|
struct mcctrl_cpu_topology *cpu;
|
||||||
struct mcctrl_cpu_topology *next;
|
struct mcctrl_cpu_topology *next;
|
||||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
struct cpu_topology *cpu;
|
|
||||||
struct cpu_topology *next;
|
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
|
|
||||||
list_for_each_entry_safe(cpu, next, &udp->cpu_topology_list, chain) {
|
list_for_each_entry_safe(cpu, next, &udp->cpu_topology_list, chain) {
|
||||||
list_del(&cpu->chain);
|
list_del(&cpu->chain);
|
||||||
@@ -198,6 +197,9 @@ static void free_cpu_topology(struct mcctrl_usrdata *udp)
|
|||||||
void free_topology_info(ihk_os_t os)
|
void free_topology_info(ihk_os_t os)
|
||||||
{
|
{
|
||||||
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
|
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
|
||||||
|
int bkt;
|
||||||
|
struct hlist_node *tmp;
|
||||||
|
struct physical_core_id *cur;
|
||||||
|
|
||||||
if (!udp) {
|
if (!udp) {
|
||||||
pr_warn("%s: warning: mcctrl_usrdata not found\n", __func__);
|
pr_warn("%s: warning: mcctrl_usrdata not found\n", __func__);
|
||||||
@@ -207,6 +209,11 @@ void free_topology_info(ihk_os_t os)
|
|||||||
free_node_topology(udp);
|
free_node_topology(udp);
|
||||||
free_cpu_topology(udp);
|
free_cpu_topology(udp);
|
||||||
|
|
||||||
|
hash_for_each_safe(physical_core_id_map, bkt, tmp, cur, next) {
|
||||||
|
hash_del(&cur->next);
|
||||||
|
kfree(cur);
|
||||||
|
}
|
||||||
|
|
||||||
return;
|
return;
|
||||||
} /* free_topology_info() */
|
} /* free_topology_info() */
|
||||||
|
|
||||||
@@ -315,13 +322,8 @@ static int translate_cpumap(struct mcctrl_usrdata *udp,
|
|||||||
return error;
|
return error;
|
||||||
} /* translate_cpumap() */
|
} /* translate_cpumap() */
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
|
||||||
static struct cache_topology *get_cache_topology(struct mcctrl_usrdata *udp,
|
static struct cache_topology *get_cache_topology(struct mcctrl_usrdata *udp,
|
||||||
struct mcctrl_cpu_topology *cpu_topo, struct ihk_cache_topology *saved)
|
struct mcctrl_cpu_topology *cpu_topo, struct ihk_cache_topology *saved)
|
||||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
static struct cache_topology *get_cache_topology(struct mcctrl_usrdata *udp,
|
|
||||||
struct cpu_topology *cpu_topo, struct ihk_cache_topology *saved)
|
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
{
|
{
|
||||||
int error;
|
int error;
|
||||||
struct cache_topology *topo = NULL;
|
struct cache_topology *topo = NULL;
|
||||||
@@ -355,23 +357,19 @@ out:
|
|||||||
return (error)? ERR_PTR(error): topo;
|
return (error)? ERR_PTR(error): topo;
|
||||||
} /* get_cache_topology() */
|
} /* get_cache_topology() */
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
|
||||||
static struct mcctrl_cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
|
static struct mcctrl_cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
|
||||||
int index)
|
int index)
|
||||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
static struct cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
|
|
||||||
int index)
|
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
{
|
{
|
||||||
int error;
|
int error;
|
||||||
ihk_device_t dev = ihk_os_to_dev(udp->os);
|
ihk_device_t dev = ihk_os_to_dev(udp->os);
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
|
||||||
struct mcctrl_cpu_topology *topology = NULL;
|
struct mcctrl_cpu_topology *topology = NULL;
|
||||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
struct cpu_topology *topology = NULL;
|
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
struct cache_topology *cache;
|
struct cache_topology *cache;
|
||||||
struct ihk_cache_topology *saved_cache;
|
struct ihk_cache_topology *saved_cache;
|
||||||
|
int linux_core_id;
|
||||||
|
int mckernel_core_id;
|
||||||
|
struct physical_core_id *entry;
|
||||||
|
struct physical_core_id *cur;
|
||||||
|
static int nr_mckernel_core;
|
||||||
|
|
||||||
dprintk("get_one_cpu_topology(%p,%d)\n", udp, index);
|
dprintk("get_one_cpu_topology(%p,%d)\n", udp, index);
|
||||||
topology = kmalloc(sizeof(*topology), GFP_KERNEL);
|
topology = kmalloc(sizeof(*topology), GFP_KERNEL);
|
||||||
@@ -387,12 +385,8 @@ static struct cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
|
|||||||
topology->saved = ihk_device_get_cpu_topology(dev,
|
topology->saved = ihk_device_get_cpu_topology(dev,
|
||||||
mckernel_cpu_2_hw_id(udp, index));
|
mckernel_cpu_2_hw_id(udp, index));
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_TEMP_FIX_21 /* IS_ERR() through return NULL */
|
|
||||||
if (!topology->saved) {
|
if (!topology->saved) {
|
||||||
#else /* POSTK_DEBUG_TEMP_FIX_21 */
|
error = -ENOENT;
|
||||||
if (IS_ERR(topology->saved)) {
|
|
||||||
#endif /* POSTK_DEBUG_TEMP_FIX_21 */
|
|
||||||
error = PTR_ERR(topology->saved);
|
|
||||||
eprintk("mcctrl:get_one_cpu_topology:"
|
eprintk("mcctrl:get_one_cpu_topology:"
|
||||||
"ihk_device_get_cpu_topology failed. %d\n",
|
"ihk_device_get_cpu_topology failed. %d\n",
|
||||||
error);
|
error);
|
||||||
@@ -419,6 +413,22 @@ static struct cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
|
|||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
linux_core_id = topology->saved->core_id;
|
||||||
|
mckernel_core_id = -1;
|
||||||
|
hash_for_each_possible(physical_core_id_map, cur, next, linux_core_id) {
|
||||||
|
mckernel_core_id = cur->mckernel_core_id;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (mckernel_core_id < 0) {
|
||||||
|
mckernel_core_id = nr_mckernel_core++;
|
||||||
|
entry = kmalloc(sizeof(struct physical_core_id), GFP_KERNEL);
|
||||||
|
entry->linux_core_id = linux_core_id;
|
||||||
|
entry->mckernel_core_id = mckernel_core_id;
|
||||||
|
hash_add(physical_core_id_map,
|
||||||
|
&entry->next, entry->linux_core_id);
|
||||||
|
}
|
||||||
|
topology->mckernel_core_id = mckernel_core_id;
|
||||||
|
|
||||||
list_for_each_entry(saved_cache,
|
list_for_each_entry(saved_cache,
|
||||||
&topology->saved->cache_topology_list, chain) {
|
&topology->saved->cache_topology_list, chain) {
|
||||||
cache = get_cache_topology(udp, topology, saved_cache);
|
cache = get_cache_topology(udp, topology, saved_cache);
|
||||||
@@ -428,6 +438,9 @@ static struct cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
|
|||||||
"get_cache_topology failed. %d\n",
|
"get_cache_topology failed. %d\n",
|
||||||
error);
|
error);
|
||||||
goto out;
|
goto out;
|
||||||
|
} else if (!cache) {
|
||||||
|
error = -ENOENT;
|
||||||
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
list_add(&cache->chain, &topology->cache_list);
|
list_add(&cache->chain, &topology->cache_list);
|
||||||
@@ -447,11 +460,7 @@ static int get_cpu_topology(struct mcctrl_usrdata *udp)
|
|||||||
{
|
{
|
||||||
int error;
|
int error;
|
||||||
int index;
|
int index;
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
|
||||||
struct mcctrl_cpu_topology *topology;
|
struct mcctrl_cpu_topology *topology;
|
||||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
struct cpu_topology *topology;
|
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
|
|
||||||
dprintk("get_cpu_topology(%p)\n", udp);
|
dprintk("get_cpu_topology(%p)\n", udp);
|
||||||
for (index = 0; index < udp->cpu_info->n_cpus; ++index) {
|
for (index = 0; index < udp->cpu_info->n_cpus; ++index) {
|
||||||
@@ -473,13 +482,8 @@ out:
|
|||||||
return error;
|
return error;
|
||||||
} /* get_cpu_topology() */
|
} /* get_cpu_topology() */
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
|
||||||
static void setup_cpu_sysfs_cache_files(struct mcctrl_usrdata *udp,
|
static void setup_cpu_sysfs_cache_files(struct mcctrl_usrdata *udp,
|
||||||
struct mcctrl_cpu_topology *cpu, struct cache_topology *cache)
|
struct mcctrl_cpu_topology *cpu, struct cache_topology *cache)
|
||||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
static void setup_cpu_sysfs_cache_files(struct mcctrl_usrdata *udp,
|
|
||||||
struct cpu_topology *cpu, struct cache_topology *cache)
|
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
{
|
{
|
||||||
char *prefix = "/sys/devices/system/cpu";
|
char *prefix = "/sys/devices/system/cpu";
|
||||||
int cpu_number = cpu->mckernel_cpu_id;
|
int cpu_number = cpu->mckernel_cpu_id;
|
||||||
@@ -531,13 +535,8 @@ static void setup_cpu_sysfs_cache_files(struct mcctrl_usrdata *udp,
|
|||||||
return;
|
return;
|
||||||
} /* setup_cpu_sysfs_cache_files() */
|
} /* setup_cpu_sysfs_cache_files() */
|
||||||
|
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
|
||||||
static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
|
static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
|
||||||
struct mcctrl_cpu_topology *cpu)
|
struct mcctrl_cpu_topology *cpu)
|
||||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
|
|
||||||
struct cpu_topology *cpu)
|
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
{
|
{
|
||||||
char *prefix = "/sys/devices/system/cpu";
|
char *prefix = "/sys/devices/system/cpu";
|
||||||
int cpu_number = cpu->mckernel_cpu_id;
|
int cpu_number = cpu->mckernel_cpu_id;
|
||||||
@@ -551,7 +550,7 @@ static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
|
|||||||
"%s/cpu%d/topology/physical_package_id",
|
"%s/cpu%d/topology/physical_package_id",
|
||||||
prefix, cpu_number);
|
prefix, cpu_number);
|
||||||
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_d32,
|
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_d32,
|
||||||
&cpu->saved->core_id, 0444,
|
&cpu->mckernel_core_id, 0444,
|
||||||
"%s/cpu%d/topology/core_id",
|
"%s/cpu%d/topology/core_id",
|
||||||
prefix, cpu_number);
|
prefix, cpu_number);
|
||||||
|
|
||||||
@@ -586,11 +585,7 @@ static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
|
|||||||
static void setup_cpus_sysfs_files(struct mcctrl_usrdata *udp)
|
static void setup_cpus_sysfs_files(struct mcctrl_usrdata *udp)
|
||||||
{
|
{
|
||||||
int error;
|
int error;
|
||||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
|
||||||
struct mcctrl_cpu_topology *cpu;
|
struct mcctrl_cpu_topology *cpu;
|
||||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
struct cpu_topology *cpu;
|
|
||||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
|
||||||
|
|
||||||
error = get_cpu_topology(udp);
|
error = get_cpu_topology(udp);
|
||||||
if (error) {
|
if (error) {
|
||||||
|
|||||||
@@ -1,14 +0,0 @@
|
|||||||
# LESS/GREATER_EQUAL appears somewhere in 3.7... meh compat until we stop caring about 2.x
|
|
||||||
# ...apparently can't define macros ot use inside if, so unfold manually
|
|
||||||
|
|
||||||
if(NOT (LINUX_VERSION_CODE LESS 262144) AND NOT (LINUX_VERSION_CODE GREATER 262400))
|
|
||||||
add_subdirectory("linux-4.0.9")
|
|
||||||
elseif(NOT (LINUX_VERSION_CODE LESS 263680) AND NOT (LINUX_VERSION_CODE GREATER 263936))
|
|
||||||
add_subdirectory("linux-4.6.7")
|
|
||||||
elseif(LINUX_VERSION_CODE EQUAL 199168)
|
|
||||||
add_subdirectory("linux-3.10.0-327.36.1.el7")
|
|
||||||
else()
|
|
||||||
#add_subdirectory("linux-3.10.0-327.36.1.el7")
|
|
||||||
add_subdirectory("linux-4.18.14")
|
|
||||||
#message(FATAL_ERROR "mcoverlayfs enabled but kernel version not compatible")
|
|
||||||
endif()
|
|
||||||
@@ -1,7 +0,0 @@
|
|||||||
kmod(mcoverlay
|
|
||||||
SOURCES
|
|
||||||
copy_up.c dir.c inode.c readdir.c super.c
|
|
||||||
INSTALL_DEST
|
|
||||||
${KMODDIR}
|
|
||||||
)
|
|
||||||
|
|
||||||
@@ -1,461 +0,0 @@
|
|||||||
/*
|
|
||||||
*
|
|
||||||
* Copyright (C) 2011 Novell Inc.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 as published by
|
|
||||||
* the Free Software Foundation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <linux/module.h>
|
|
||||||
#include <linux/fs.h>
|
|
||||||
#include <linux/slab.h>
|
|
||||||
#include <linux/file.h>
|
|
||||||
#include <linux/splice.h>
|
|
||||||
#include <linux/xattr.h>
|
|
||||||
#include <linux/security.h>
|
|
||||||
#include <linux/uaccess.h>
|
|
||||||
#include <linux/sched.h>
|
|
||||||
#include <linux/namei.h>
|
|
||||||
#include <linux/fdtable.h>
|
|
||||||
#include <linux/ratelimit.h>
|
|
||||||
#include "overlayfs.h"
|
|
||||||
|
|
||||||
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
|
|
||||||
|
|
||||||
static unsigned ovl_check_copy_up = 1;
|
|
||||||
module_param_named(check_copy_up, ovl_check_copy_up, uint,
|
|
||||||
S_IWUSR | S_IRUGO);
|
|
||||||
MODULE_PARM_DESC(ovl_check_copy_up,
|
|
||||||
"Warn on copy-up when causing process also has a R/O fd open");
|
|
||||||
|
|
||||||
static int ovl_check_fd(const void *data, struct file *f, unsigned fd)
|
|
||||||
{
|
|
||||||
const struct dentry *dentry = data;
|
|
||||||
|
|
||||||
if (f->f_path.dentry == dentry)
|
|
||||||
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
|
|
||||||
f, fd, current->pid, current->comm);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Check the fds open by this process and warn if something like the following
|
|
||||||
* scenario is about to occur:
|
|
||||||
*
|
|
||||||
* fd1 = open("foo", O_RDONLY);
|
|
||||||
* fd2 = open("foo", O_RDWR);
|
|
||||||
*/
|
|
||||||
static void ovl_do_check_copy_up(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
if (ovl_check_copy_up)
|
|
||||||
iterate_fd(current->files, 0, ovl_check_fd, dentry);
|
|
||||||
}
|
|
||||||
|
|
||||||
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
|
|
||||||
{
|
|
||||||
ssize_t list_size, size, value_size = 0;
|
|
||||||
char *buf, *name, *value = NULL;
|
|
||||||
int uninitialized_var(error);
|
|
||||||
|
|
||||||
if (!old->d_inode->i_op->getxattr ||
|
|
||||||
!new->d_inode->i_op->getxattr)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
list_size = vfs_listxattr(old, NULL, 0);
|
|
||||||
if (list_size <= 0) {
|
|
||||||
if (list_size == -EOPNOTSUPP)
|
|
||||||
return 0;
|
|
||||||
return list_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
buf = kzalloc(list_size, GFP_KERNEL);
|
|
||||||
if (!buf)
|
|
||||||
return -ENOMEM;
|
|
||||||
|
|
||||||
list_size = vfs_listxattr(old, buf, list_size);
|
|
||||||
if (list_size <= 0) {
|
|
||||||
error = list_size;
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
|
|
||||||
retry:
|
|
||||||
size = vfs_getxattr(old, name, value, value_size);
|
|
||||||
if (size == -ERANGE)
|
|
||||||
size = vfs_getxattr(old, name, NULL, 0);
|
|
||||||
|
|
||||||
if (size < 0) {
|
|
||||||
error = size;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (size > value_size) {
|
|
||||||
void *new;
|
|
||||||
|
|
||||||
new = krealloc(value, size, GFP_KERNEL);
|
|
||||||
if (!new) {
|
|
||||||
error = -ENOMEM;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
value = new;
|
|
||||||
value_size = size;
|
|
||||||
goto retry;
|
|
||||||
}
|
|
||||||
|
|
||||||
error = vfs_setxattr(new, name, value, size, 0);
|
|
||||||
if (error)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
kfree(value);
|
|
||||||
out:
|
|
||||||
kfree(buf);
|
|
||||||
return error;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
|
|
||||||
{
|
|
||||||
struct file *old_file;
|
|
||||||
struct file *new_file;
|
|
||||||
loff_t old_pos = 0;
|
|
||||||
loff_t new_pos = 0;
|
|
||||||
int error = 0;
|
|
||||||
|
|
||||||
if (len == 0)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
old_file = ovl_path_open(old, O_RDONLY);
|
|
||||||
if (IS_ERR(old_file))
|
|
||||||
return PTR_ERR(old_file);
|
|
||||||
|
|
||||||
new_file = ovl_path_open(new, O_WRONLY);
|
|
||||||
if (IS_ERR(new_file)) {
|
|
||||||
error = PTR_ERR(new_file);
|
|
||||||
goto out_fput;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* FIXME: copy up sparse files efficiently */
|
|
||||||
while (len) {
|
|
||||||
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
|
|
||||||
long bytes;
|
|
||||||
|
|
||||||
if (len < this_len)
|
|
||||||
this_len = len;
|
|
||||||
|
|
||||||
if (signal_pending_state(TASK_KILLABLE, current)) {
|
|
||||||
error = -EINTR;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
bytes = do_splice_direct(old_file, &old_pos,
|
|
||||||
new_file, &new_pos,
|
|
||||||
this_len, SPLICE_F_MOVE);
|
|
||||||
if (bytes <= 0) {
|
|
||||||
error = bytes;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
WARN_ON(old_pos != new_pos);
|
|
||||||
|
|
||||||
len -= bytes;
|
|
||||||
}
|
|
||||||
|
|
||||||
fput(new_file);
|
|
||||||
out_fput:
|
|
||||||
fput(old_file);
|
|
||||||
return error;
|
|
||||||
}
|
|
||||||
|
|
||||||
static char *ovl_read_symlink(struct dentry *realdentry)
|
|
||||||
{
|
|
||||||
int res;
|
|
||||||
char *buf;
|
|
||||||
struct inode *inode = realdentry->d_inode;
|
|
||||||
mm_segment_t old_fs;
|
|
||||||
|
|
||||||
res = -EINVAL;
|
|
||||||
if (!inode->i_op->readlink)
|
|
||||||
goto err;
|
|
||||||
|
|
||||||
res = -ENOMEM;
|
|
||||||
buf = (char *) __get_free_page(GFP_KERNEL);
|
|
||||||
if (!buf)
|
|
||||||
goto err;
|
|
||||||
|
|
||||||
old_fs = get_fs();
|
|
||||||
set_fs(get_ds());
|
|
||||||
/* The cast to a user pointer is valid due to the set_fs() */
|
|
||||||
res = inode->i_op->readlink(realdentry,
|
|
||||||
(char __user *)buf, PAGE_SIZE - 1);
|
|
||||||
set_fs(old_fs);
|
|
||||||
if (res < 0) {
|
|
||||||
free_page((unsigned long) buf);
|
|
||||||
goto err;
|
|
||||||
}
|
|
||||||
buf[res] = '\0';
|
|
||||||
|
|
||||||
return buf;
|
|
||||||
|
|
||||||
err:
|
|
||||||
return ERR_PTR(res);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
|
|
||||||
{
|
|
||||||
struct iattr attr = {
|
|
||||||
.ia_valid =
|
|
||||||
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
|
|
||||||
.ia_atime = stat->atime,
|
|
||||||
.ia_mtime = stat->mtime,
|
|
||||||
};
|
|
||||||
|
|
||||||
return notify_change(upperdentry, &attr, NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
|
|
||||||
{
|
|
||||||
int err = 0;
|
|
||||||
|
|
||||||
if (!S_ISLNK(stat->mode)) {
|
|
||||||
struct iattr attr = {
|
|
||||||
.ia_valid = ATTR_MODE,
|
|
||||||
.ia_mode = stat->mode,
|
|
||||||
};
|
|
||||||
err = notify_change(upperdentry, &attr, NULL);
|
|
||||||
}
|
|
||||||
if (!err) {
|
|
||||||
struct iattr attr = {
|
|
||||||
.ia_valid = ATTR_UID | ATTR_GID,
|
|
||||||
.ia_uid = stat->uid,
|
|
||||||
.ia_gid = stat->gid,
|
|
||||||
};
|
|
||||||
err = notify_change(upperdentry, &attr, NULL);
|
|
||||||
}
|
|
||||||
if (!err)
|
|
||||||
ovl_set_timestamps(upperdentry, stat);
|
|
||||||
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
|
|
||||||
struct dentry *dentry, struct path *lowerpath,
|
|
||||||
struct kstat *stat, struct iattr *attr,
|
|
||||||
const char *link)
|
|
||||||
{
|
|
||||||
struct inode *wdir = workdir->d_inode;
|
|
||||||
struct inode *udir = upperdir->d_inode;
|
|
||||||
struct dentry *newdentry = NULL;
|
|
||||||
struct dentry *upper = NULL;
|
|
||||||
umode_t mode = stat->mode;
|
|
||||||
int err;
|
|
||||||
|
|
||||||
newdentry = ovl_lookup_temp(workdir, dentry);
|
|
||||||
err = PTR_ERR(newdentry);
|
|
||||||
if (IS_ERR(newdentry))
|
|
||||||
goto out;
|
|
||||||
|
|
||||||
upper = lookup_one_len(dentry->d_name.name, upperdir,
|
|
||||||
dentry->d_name.len);
|
|
||||||
err = PTR_ERR(upper);
|
|
||||||
if (IS_ERR(upper))
|
|
||||||
goto out1;
|
|
||||||
|
|
||||||
/* Can't properly set mode on creation because of the umask */
|
|
||||||
stat->mode &= S_IFMT;
|
|
||||||
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
|
|
||||||
stat->mode = mode;
|
|
||||||
if (err)
|
|
||||||
goto out2;
|
|
||||||
|
|
||||||
if (S_ISREG(stat->mode)) {
|
|
||||||
struct path upperpath;
|
|
||||||
ovl_path_upper(dentry, &upperpath);
|
|
||||||
BUG_ON(upperpath.dentry != NULL);
|
|
||||||
upperpath.dentry = newdentry;
|
|
||||||
|
|
||||||
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
|
|
||||||
if (err)
|
|
||||||
goto out_cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
err = ovl_copy_xattr(lowerpath->dentry, newdentry);
|
|
||||||
if (err)
|
|
||||||
goto out_cleanup;
|
|
||||||
|
|
||||||
mutex_lock(&newdentry->d_inode->i_mutex);
|
|
||||||
err = ovl_set_attr(newdentry, stat);
|
|
||||||
if (!err && attr)
|
|
||||||
err = notify_change(newdentry, attr, NULL);
|
|
||||||
mutex_unlock(&newdentry->d_inode->i_mutex);
|
|
||||||
if (err)
|
|
||||||
goto out_cleanup;
|
|
||||||
|
|
||||||
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
|
|
||||||
if (err)
|
|
||||||
goto out_cleanup;
|
|
||||||
|
|
||||||
ovl_dentry_update(dentry, newdentry);
|
|
||||||
newdentry = NULL;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Non-directores become opaque when copied up.
|
|
||||||
*/
|
|
||||||
if (!S_ISDIR(stat->mode))
|
|
||||||
ovl_dentry_set_opaque(dentry, true);
|
|
||||||
out2:
|
|
||||||
dput(upper);
|
|
||||||
out1:
|
|
||||||
dput(newdentry);
|
|
||||||
out:
|
|
||||||
return err;
|
|
||||||
|
|
||||||
out_cleanup:
|
|
||||||
ovl_cleanup(wdir, newdentry);
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Copy up a single dentry
|
|
||||||
*
|
|
||||||
* Directory renames only allowed on "pure upper" (already created on
|
|
||||||
* upper filesystem, never copied up). Directories which are on lower or
|
|
||||||
* are merged may not be renamed. For these -EXDEV is returned and
|
|
||||||
* userspace has to deal with it. This means, when copying up a
|
|
||||||
* directory we can rely on it and ancestors being stable.
|
|
||||||
*
|
|
||||||
* Non-directory renames start with copy up of source if necessary. The
|
|
||||||
* actual rename will only proceed once the copy up was successful. Copy
|
|
||||||
* up uses upper parent i_mutex for exclusion. Since rename can change
|
|
||||||
* d_parent it is possible that the copy up will lock the old parent. At
|
|
||||||
* that point the file will have already been copied up anyway.
|
|
||||||
*/
|
|
||||||
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
|
|
||||||
struct path *lowerpath, struct kstat *stat,
|
|
||||||
struct iattr *attr)
|
|
||||||
{
|
|
||||||
struct dentry *workdir = ovl_workdir(dentry);
|
|
||||||
int err;
|
|
||||||
struct kstat pstat;
|
|
||||||
struct path parentpath;
|
|
||||||
struct dentry *upperdir;
|
|
||||||
struct dentry *upperdentry;
|
|
||||||
const struct cred *old_cred;
|
|
||||||
struct cred *override_cred;
|
|
||||||
char *link = NULL;
|
|
||||||
|
|
||||||
if (WARN_ON(!workdir))
|
|
||||||
return -EROFS;
|
|
||||||
|
|
||||||
ovl_do_check_copy_up(lowerpath->dentry);
|
|
||||||
|
|
||||||
ovl_path_upper(parent, &parentpath);
|
|
||||||
upperdir = parentpath.dentry;
|
|
||||||
|
|
||||||
err = vfs_getattr(&parentpath, &pstat);
|
|
||||||
if (err)
|
|
||||||
return err;
|
|
||||||
|
|
||||||
if (S_ISLNK(stat->mode)) {
|
|
||||||
link = ovl_read_symlink(lowerpath->dentry);
|
|
||||||
if (IS_ERR(link))
|
|
||||||
return PTR_ERR(link);
|
|
||||||
}
|
|
||||||
|
|
||||||
err = -ENOMEM;
|
|
||||||
override_cred = prepare_creds();
|
|
||||||
if (!override_cred)
|
|
||||||
goto out_free_link;
|
|
||||||
|
|
||||||
override_cred->fsuid = stat->uid;
|
|
||||||
override_cred->fsgid = stat->gid;
|
|
||||||
/*
|
|
||||||
* CAP_SYS_ADMIN for copying up extended attributes
|
|
||||||
* CAP_DAC_OVERRIDE for create
|
|
||||||
* CAP_FOWNER for chmod, timestamp update
|
|
||||||
* CAP_FSETID for chmod
|
|
||||||
* CAP_CHOWN for chown
|
|
||||||
* CAP_MKNOD for mknod
|
|
||||||
*/
|
|
||||||
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
|
|
||||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
|
||||||
cap_raise(override_cred->cap_effective, CAP_FOWNER);
|
|
||||||
cap_raise(override_cred->cap_effective, CAP_FSETID);
|
|
||||||
cap_raise(override_cred->cap_effective, CAP_CHOWN);
|
|
||||||
cap_raise(override_cred->cap_effective, CAP_MKNOD);
|
|
||||||
old_cred = override_creds(override_cred);
|
|
||||||
|
|
||||||
err = -EIO;
|
|
||||||
if (lock_rename(workdir, upperdir) != NULL) {
|
|
||||||
pr_err("overlayfs: failed to lock workdir+upperdir\n");
|
|
||||||
goto out_unlock;
|
|
||||||
}
|
|
||||||
upperdentry = ovl_dentry_upper(dentry);
|
|
||||||
if (upperdentry) {
|
|
||||||
unlock_rename(workdir, upperdir);
|
|
||||||
err = 0;
|
|
||||||
/* Raced with another copy-up? Do the setattr here */
|
|
||||||
if (attr) {
|
|
||||||
mutex_lock(&upperdentry->d_inode->i_mutex);
|
|
||||||
err = notify_change(upperdentry, attr, NULL);
|
|
||||||
mutex_unlock(&upperdentry->d_inode->i_mutex);
|
|
||||||
}
|
|
||||||
goto out_put_cred;
|
|
||||||
}
|
|
||||||
|
|
||||||
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
|
|
||||||
stat, attr, link);
|
|
||||||
if (!err) {
|
|
||||||
/* Restore timestamps on parent (best effort) */
|
|
||||||
ovl_set_timestamps(upperdir, &pstat);
|
|
||||||
}
|
|
||||||
out_unlock:
|
|
||||||
unlock_rename(workdir, upperdir);
|
|
||||||
out_put_cred:
|
|
||||||
revert_creds(old_cred);
|
|
||||||
put_cred(override_cred);
|
|
||||||
|
|
||||||
out_free_link:
|
|
||||||
if (link)
|
|
||||||
free_page((unsigned long) link);
|
|
||||||
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Copy up @dentry together with any not-yet-copied-up ancestors.
 *
 * Repeatedly walks from @dentry towards the root to find the topmost
 * ancestor that has no upper-layer counterpart, copies that one up,
 * and loops until @dentry itself has an upper dentry (or an error
 * occurs).  Copying one level per iteration keeps the locking simple
 * and tolerates races with other tasks copying up the same path.
 *
 * Returns 0 on success or a negative errno.
 */
int ovl_copy_up(struct dentry *dentry)
{
	int err;

	err = 0;
	while (!err) {
		struct dentry *next;
		struct dentry *parent;
		struct path lowerpath;
		struct kstat stat;
		enum ovl_path_type type = ovl_path_type(dentry);

		/* Already copied up (possibly by a racing task)?  Done. */
		if (OVL_TYPE_UPPER(type))
			break;

		next = dget(dentry);
		/* find the topmost dentry not yet copied up */
		for (;;) {
			parent = dget_parent(next);

			type = ovl_path_type(parent);
			if (OVL_TYPE_UPPER(type))
				break;

			dput(next);
			next = parent;
		}

		/* Copy up this one level; the outer loop re-checks @dentry */
		ovl_path_lower(next, &lowerpath);
		err = vfs_getattr(&lowerpath, &stat);
		if (!err)
			err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);

		dput(parent);
		dput(next);
	}

	return err;
}
|
|
||||||
@@ -1,972 +0,0 @@
|
|||||||
/*
|
|
||||||
*
|
|
||||||
* Copyright (C) 2011 Novell Inc.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 as published by
|
|
||||||
* the Free Software Foundation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <linux/fs.h>
|
|
||||||
#include <linux/namei.h>
|
|
||||||
#include <linux/xattr.h>
|
|
||||||
#include <linux/security.h>
|
|
||||||
#include <linux/cred.h>
|
|
||||||
#include "overlayfs.h"
|
|
||||||
|
|
||||||
/*
 * Remove a leftover object from the work directory: rmdir for
 * directories, unlink otherwise.  Failure is only logged, since the
 * caller cannot do anything useful about a stuck workdir entry.
 * Caller holds i_mutex on @wdir.
 */
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
	int err;

	/* Pin the dentry across the vfs operation */
	dget(wdentry);
	if (S_ISDIR(wdentry->d_inode->i_mode))
		err = ovl_do_rmdir(wdir, wdentry);
	else
		err = ovl_do_unlink(wdir, wdentry);
	dput(wdentry);

	if (err) {
		pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
		       wdentry, err);
	}
}
|
|
||||||
|
|
||||||
/*
 * Look up a unique temporary (negative) dentry in the workdir.  The
 * name is derived from the overlay dentry pointer, which is unique
 * while that dentry exists.  If a positive dentry with that name
 * already exists (e.g. stale leftover), fail with -EIO rather than
 * reuse it.  Returns the negative dentry or an ERR_PTR.
 */
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
	struct dentry *temp;
	char name[20];

	snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);

	temp = lookup_one_len(name, workdir, strlen(name));
	if (!IS_ERR(temp) && temp->d_inode) {
		pr_err("overlayfs: workdir/%s already exists\n", name);
		dput(temp);
		temp = ERR_PTR(-EIO);
	}

	return temp;
}
|
|
||||||
|
|
||||||
/* caller holds i_mutex on workdir */
static struct dentry *ovl_whiteout(struct dentry *workdir,
				   struct dentry *dentry)
{
	int err;
	struct dentry *whiteout;
	struct inode *wdir = workdir->d_inode;

	/* Create the whiteout under a unique temp name in the workdir */
	whiteout = ovl_lookup_temp(workdir, dentry);
	if (IS_ERR(whiteout))
		return whiteout;

	err = ovl_do_whiteout(wdir, whiteout);
	if (err) {
		dput(whiteout);
		whiteout = ERR_PTR(err);
	}

	/* On success, caller renames the whiteout into place */
	return whiteout;
}
|
|
||||||
|
|
||||||
/*
 * Create a real filesystem object under @dir for the given @stat:
 * a hard link to @hardlink if that is non-NULL, otherwise a regular
 * file, directory, device node, fifo, socket or symlink according to
 * the file type bits in stat->mode.  @debug selects the logging
 * ovl_do_* wrappers.  Caller holds i_mutex on @dir.
 */
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
		    struct kstat *stat, const char *link,
		    struct dentry *hardlink, bool debug)
{
	int err;

	/* Target must be negative; a positive dentry means we raced */
	if (newdentry->d_inode)
		return -ESTALE;

	if (hardlink) {
		err = ovl_do_link(hardlink, dir, newdentry, debug);
	} else {
		switch (stat->mode & S_IFMT) {
		case S_IFREG:
			err = ovl_do_create(dir, newdentry, stat->mode, debug);
			break;

		case S_IFDIR:
			err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
			break;

		case S_IFCHR:
		case S_IFBLK:
		case S_IFIFO:
		case S_IFSOCK:
			err = ovl_do_mknod(dir, newdentry,
					   stat->mode, stat->rdev, debug);
			break;

		case S_IFLNK:
			err = ovl_do_symlink(dir, newdentry, link, debug);
			break;

		default:
			/* Unknown file type: refuse */
			err = -EPERM;
		}
	}
	if (!err && WARN_ON(!newdentry->d_inode)) {
		/*
		 * Not quite sure if non-instantiated dentry is legal or not.
		 * VFS doesn't seem to care so check and warn here.
		 */
		err = -ENOENT;
	}
	return err;
}
|
|
||||||
|
|
||||||
/*
 * Mark an upper directory as opaque so entries from the lower layer
 * underneath it are hidden.
 */
static int ovl_set_opaque(struct dentry *upperdentry)
{
	return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
}
|
|
||||||
|
|
||||||
/*
 * Remove the opaque marker again (used when undoing a failed rename
 * or when a rename made the marker unnecessary).  Failure is only
 * warned about: the directory merely stays opaque.
 */
static void ovl_remove_opaque(struct dentry *upperdentry)
{
	int err;

	err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
	if (err) {
		pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
			upperdentry->d_name.name, err);
	}
}
|
|
||||||
|
|
||||||
/*
 * getattr for overlay directories: take the attributes from the real
 * (upper or lower) directory, but report the overlay's own device and
 * inode numbers, and pretend nlink=1 for merged directories.
 */
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
			   struct kstat *stat)
{
	int err;
	enum ovl_path_type type;
	struct path realpath;

	type = ovl_path_real(dentry, &realpath);
	err = vfs_getattr(&realpath, stat);
	if (err)
		return err;

	/* Present overlay identity, not the backing filesystem's */
	stat->dev = dentry->d_sb->s_dev;
	stat->ino = dentry->d_inode->i_ino;

	/*
	 * It's probably not worth it to count subdirs to get the
	 * correct link count.  nlink=1 seems to pacify 'find' and
	 * other utilities.
	 */
	if (OVL_TYPE_MERGE(type))
		stat->nlink = 1;

	return 0;
}
|
|
||||||
|
|
||||||
/*
 * Create @dentry directly in the upper parent directory (the simple
 * case: no whiteout needs replacing).  On success the overlay dentry
 * is switched over to the new upper dentry and instantiated with
 * @inode.
 */
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
			    struct kstat *stat, const char *link,
			    struct dentry *hardlink)
{
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *newdentry;
	int err;

	mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
	newdentry = lookup_one_len(dentry->d_name.name, upperdir,
				   dentry->d_name.len);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out_unlock;
	err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
	if (err)
		goto out_dput;

	ovl_dentry_version_inc(dentry->d_parent);
	ovl_dentry_update(dentry, newdentry);
	ovl_copyattr(newdentry->d_inode, inode);
	d_instantiate(dentry, inode);
	/* Reference now owned by the overlay dentry; skip the dput below */
	newdentry = NULL;
out_dput:
	dput(newdentry);
out_unlock:
	mutex_unlock(&udir->i_mutex);
	return err;
}
|
|
||||||
|
|
||||||
/*
 * Lock workdir and upperdir for a cross-directory rename.  Fails with
 * -EIO if the two are the same directory or one is an ancestor of the
 * other (lock_rename() returning a non-NULL trap), which indicates a
 * misconfigured overlay.  On success the caller must eventually call
 * unlock_rename().
 */
static int ovl_lock_rename_workdir(struct dentry *workdir,
				   struct dentry *upperdir)
{
	/* Workdir should not be the same as upperdir */
	if (workdir == upperdir)
		goto err;

	/* Workdir should not be subdir of upperdir and vice versa */
	if (lock_rename(workdir, upperdir) != NULL)
		goto err_unlock;

	return 0;

err_unlock:
	unlock_rename(workdir, upperdir);
err:
	pr_err("overlayfs: failed to lock workdir+upperdir\n");
	return -EIO;
}
|
|
||||||
|
|
||||||
/*
 * Replace an upper directory whose only contents are the whiteouts in
 * @list with a fresh, empty, opaque directory, so it can afterwards be
 * removed or renamed like an empty directory.
 *
 * The replacement is built in the workdir, inherits the original's
 * xattrs and attributes, is marked opaque, and is then atomically
 * exchanged with the old upper directory via RENAME_EXCHANGE.  The old
 * directory (now in the workdir) has its whiteouts removed and is
 * deleted.  Returns the replacement dentry or an ERR_PTR.
 */
static struct dentry *ovl_clear_empty(struct dentry *dentry,
				      struct list_head *list)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir = workdir->d_inode;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct path upperpath;
	struct dentry *upper;
	struct dentry *opaquedir;
	struct kstat stat;
	int err;

	/* No workdir means the overlay is effectively read-only */
	if (WARN_ON(!workdir))
		return ERR_PTR(-EROFS);

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	ovl_path_upper(dentry, &upperpath);
	err = vfs_getattr(&upperpath, &stat);
	if (err)
		goto out_unlock;

	/* Re-check under lock: still a directory in the upper parent? */
	err = -ESTALE;
	if (!S_ISDIR(stat.mode))
		goto out_unlock;
	upper = upperpath.dentry;
	if (upper->d_parent->d_inode != udir)
		goto out_unlock;

	opaquedir = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(opaquedir);
	if (IS_ERR(opaquedir))
		goto out_unlock;

	err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
	if (err)
		goto out_dput;

	err = ovl_copy_xattr(upper, opaquedir);
	if (err)
		goto out_cleanup;

	err = ovl_set_opaque(opaquedir);
	if (err)
		goto out_cleanup;

	mutex_lock(&opaquedir->d_inode->i_mutex);
	err = ovl_set_attr(opaquedir, &stat);
	mutex_unlock(&opaquedir->d_inode->i_mutex);
	if (err)
		goto out_cleanup;

	/* Atomically swap the whiteout-filled dir with the new one */
	err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
	if (err)
		goto out_cleanup;

	ovl_cleanup_whiteouts(upper, list);
	ovl_cleanup(wdir, upper);
	unlock_rename(workdir, upperdir);

	/* dentry's upper doesn't match now, get rid of it */
	d_drop(dentry);

	return opaquedir;

out_cleanup:
	ovl_cleanup(wdir, opaquedir);
out_dput:
	dput(opaquedir);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return ERR_PTR(err);
}
|
|
||||||
|
|
||||||
/*
 * Verify that a merged directory is logically empty (its upper layer
 * contains only whiteouts) and, if it has an upper directory, replace
 * that with a clean opaque one via ovl_clear_empty().  Returns the
 * replacement dentry, NULL when there was no upper dir to clear, or
 * an ERR_PTR on error.
 */
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
	int err;
	struct dentry *ret = NULL;
	LIST_HEAD(list);

	err = ovl_check_empty_dir(dentry, &list);
	if (err)
		ret = ERR_PTR(err);
	else {
		/*
		 * If no upperdentry then skip clearing whiteouts.
		 *
		 * Can race with copy-up, since we don't hold the upperdir
		 * mutex.  Doesn't matter, since copy-up can't create a
		 * non-empty directory from an empty one.
		 */
		if (ovl_dentry_upper(dentry))
			ret = ovl_clear_empty(dentry, &list);
	}

	ovl_cache_free(&list);

	return ret;
}
|
|
||||||
|
|
||||||
/*
 * Create an object whose name is currently covered by a whiteout:
 * build the new object in the workdir, then rename it over the
 * whiteout in the upper directory.  New directories are marked opaque
 * first and swapped in with RENAME_EXCHANGE so the displaced whiteout
 * can be cleaned up from the workdir afterwards.
 */
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
				    struct kstat *stat, const char *link,
				    struct dentry *hardlink)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir = workdir->d_inode;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *upper;
	struct dentry *newdentry;
	int err;

	if (WARN_ON(!workdir))
		return -EROFS;

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	newdentry = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out_unlock;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out_dput;

	err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
	if (err)
		goto out_dput2;

	if (S_ISDIR(stat->mode)) {
		/* New dir hides lower entries; mark opaque before exposing */
		err = ovl_set_opaque(newdentry);
		if (err)
			goto out_cleanup;

		err = ovl_do_rename(wdir, newdentry, udir, upper,
				    RENAME_EXCHANGE);
		if (err)
			goto out_cleanup;

		/* The whiteout was exchanged into the workdir; remove it */
		ovl_cleanup(wdir, upper);
	} else {
		err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
		if (err)
			goto out_cleanup;
	}
	ovl_dentry_version_inc(dentry->d_parent);
	ovl_dentry_update(dentry, newdentry);
	ovl_copyattr(newdentry->d_inode, inode);
	d_instantiate(dentry, inode);
	/* Reference now owned by the overlay dentry; skip the dput below */
	newdentry = NULL;
out_dput2:
	dput(upper);
out_dput:
	dput(newdentry);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return err;

out_cleanup:
	ovl_cleanup(wdir, newdentry);
	goto out_dput2;
}
|
|
||||||
|
|
||||||
/*
 * Common implementation for create/mkdir/mknod/symlink/link.
 * Allocates the overlay inode, copies up the parent, then creates the
 * object either directly in the upper dir or — if the name is covered
 * by a whiteout — via the workdir with temporarily raised
 * capabilities.
 */
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
			      const char *link, struct dentry *hardlink)
{
	int err;
	struct inode *inode;
	struct kstat stat = {
		.mode = mode,
		.rdev = rdev,
	};

	err = -ENOMEM;
	inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
	if (!inode)
		goto out;

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_iput;

	if (!ovl_dentry_is_opaque(dentry)) {
		err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
	} else {
		const struct cred *old_cred;
		struct cred *override_cred;

		err = -ENOMEM;
		override_cred = prepare_creds();
		if (!override_cred)
			goto out_iput;

		/*
		 * CAP_SYS_ADMIN for setting opaque xattr
		 * CAP_DAC_OVERRIDE for create in workdir, rename
		 * CAP_FOWNER for removing whiteout from sticky dir
		 */
		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(override_cred->cap_effective, CAP_FOWNER);
		old_cred = override_creds(override_cred);

		err = ovl_create_over_whiteout(dentry, inode, &stat, link,
					       hardlink);

		revert_creds(old_cred);
		put_cred(override_cred);
	}

	/* On success the inode reference was consumed by d_instantiate() */
	if (!err)
		inode = NULL;
out_iput:
	iput(inode);
out:
	return err;
}
|
|
||||||
|
|
||||||
/*
 * Take and release mnt write access around the actual object creation.
 * Thin wrapper used by the create/mkdir/mknod/symlink entry points.
 */
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
			     const char *link)
{
	int err = ovl_want_write(dentry);

	if (err)
		return err;

	err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
	ovl_drop_write(dentry);

	return err;
}
|
|
||||||
|
|
||||||
/*
 * .create: make a regular file.  Only the permission bits of @mode are
 * used; @excl is ignored here (the VFS handles O_EXCL via lookup).
 */
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
		      bool excl)
{
	return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}
|
|
||||||
|
|
||||||
/* .mkdir: create a directory with the given permission bits */
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
}
|
|
||||||
|
|
||||||
/*
 * .mknod: create a device node, fifo or socket.  A char device with
 * the reserved whiteout device number is refused, since it would be
 * indistinguishable from an overlayfs whiteout in the upper layer.
 */
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
		     dev_t rdev)
{
	/* Don't allow creation of "whiteout" on overlay */
	if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
		return -EPERM;

	return ovl_create_object(dentry, mode, rdev, NULL);
}
|
|
||||||
|
|
||||||
/* .symlink: create a symbolic link with target @link */
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
		       const char *link)
{
	return ovl_create_object(dentry, S_IFLNK, 0, link);
}
|
|
||||||
|
|
||||||
/*
 * .link: hard link @old to the new name @new.  The source must first
 * be copied up so the link can be made against its upper inode.
 */
static int ovl_link(struct dentry *old, struct inode *newdir,
		    struct dentry *new)
{
	int err;
	struct dentry *upper;

	err = ovl_want_write(old);
	if (err)
		goto out;

	err = ovl_copy_up(old);
	if (err)
		goto out_drop_write;

	upper = ovl_dentry_upper(old);
	err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);

out_drop_write:
	ovl_drop_write(old);
out:
	return err;
}
|
|
||||||
|
|
||||||
/*
 * Remove an object that has (or may have) lower-layer counterparts:
 * replace its upper entry with a whiteout built in the workdir.  A
 * merged directory is first exchanged for an empty opaque directory
 * (ovl_check_empty_and_clear) so the whiteout rename can treat it as
 * an empty dir; a plain opaque dir only needs an emptiness check.
 */
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir = workdir->d_inode;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *whiteout;
	struct dentry *upper;
	struct dentry *opaquedir = NULL;
	int err;
	int flags = 0;

	if (WARN_ON(!workdir))
		return -EROFS;

	if (is_dir) {
		if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
			opaquedir = ovl_check_empty_and_clear(dentry);
			err = PTR_ERR(opaquedir);
			if (IS_ERR(opaquedir))
				goto out;
		} else {
			LIST_HEAD(list);

			/*
			 * When removing an empty opaque directory, then it
			 * makes no sense to replace it with an exact replica of
			 * itself.  But emptiness still needs to be checked.
			 */
			err = ovl_check_empty_dir(dentry, &list);
			ovl_cache_free(&list);
			if (err)
				goto out;
		}
	}

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out_dput;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out_unlock;

	/* Re-check under lock that nobody replaced the upper entry */
	err = -ESTALE;
	if ((opaquedir && upper != opaquedir) ||
	    (!opaquedir && ovl_dentry_upper(dentry) &&
	     upper != ovl_dentry_upper(dentry))) {
		goto out_dput_upper;
	}

	whiteout = ovl_whiteout(workdir, dentry);
	err = PTR_ERR(whiteout);
	if (IS_ERR(whiteout))
		goto out_dput_upper;

	/* Directories are exchanged so the old dir can be cleaned up */
	if (d_is_dir(upper))
		flags = RENAME_EXCHANGE;

	err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
	if (err)
		goto kill_whiteout;
	if (flags)
		ovl_cleanup(wdir, upper);

	ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
	d_drop(dentry);
	dput(whiteout);
out_dput_upper:
	dput(upper);
out_unlock:
	unlock_rename(workdir, upperdir);
out_dput:
	dput(opaquedir);
out:
	return err;

kill_whiteout:
	/* Rename failed: remove the unused whiteout from the workdir */
	ovl_cleanup(wdir, whiteout);
	goto out_d_drop;
}
|
|
||||||
|
|
||||||
/*
 * Remove a pure-upper object directly from the upper directory.  No
 * whiteout is needed since nothing in a lower layer is shadowed by
 * this name.
 */
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *dir = upperdir->d_inode;
	struct dentry *upper;
	int err;

	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out_unlock;

	/* Only remove if the name still refers to our upper dentry */
	err = -ESTALE;
	if (upper == ovl_dentry_upper(dentry)) {
		if (is_dir)
			err = vfs_rmdir(dir, upper);
		else
			err = vfs_unlink(dir, upper, NULL);
		ovl_dentry_version_inc(dentry->d_parent);
	}
	dput(upper);

	/*
	 * Keeping this dentry hashed would mean having to release
	 * upperpath/lowerpath, which could only be done if we are the
	 * sole user of this dentry.  Too tricky...  Just unhash for
	 * now.
	 */
	if (!err)
		d_drop(dentry);
out_unlock:
	mutex_unlock(&dir->i_mutex);

	return err;
}
|
|
||||||
|
|
||||||
static inline int ovl_check_sticky(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
|
|
||||||
struct inode *inode = ovl_dentry_real(dentry)->d_inode;
|
|
||||||
|
|
||||||
if (check_sticky(dir, inode))
|
|
||||||
return -EPERM;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Common unlink/rmdir implementation.  Pure upper objects are removed
 * directly; anything with a lower counterpart is whiteouted with
 * temporarily raised capabilities.
 */
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
	enum ovl_path_type type;
	int err;

	err = ovl_check_sticky(dentry);
	if (err)
		goto out;

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	/* Parent must exist in the upper layer before we can modify it */
	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_drop_write;

	type = ovl_path_type(dentry);
	if (OVL_TYPE_PURE_UPPER(type)) {
		err = ovl_remove_upper(dentry, is_dir);
	} else {
		const struct cred *old_cred;
		struct cred *override_cred;

		err = -ENOMEM;
		override_cred = prepare_creds();
		if (!override_cred)
			goto out_drop_write;

		/*
		 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
		 * CAP_DAC_OVERRIDE for create in workdir, rename
		 * CAP_FOWNER for removing whiteout from sticky dir
		 * CAP_FSETID for chmod of opaque dir
		 * CAP_CHOWN for chown of opaque dir
		 */
		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(override_cred->cap_effective, CAP_FOWNER);
		cap_raise(override_cred->cap_effective, CAP_FSETID);
		cap_raise(override_cred->cap_effective, CAP_CHOWN);
		old_cred = override_creds(override_cred);

		err = ovl_remove_and_whiteout(dentry, is_dir);

		revert_creds(old_cred);
		put_cred(override_cred);
	}
out_drop_write:
	ovl_drop_write(dentry);
out:
	return err;
}
|
|
||||||
|
|
||||||
/* .unlink: remove a non-directory entry */
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
	return ovl_do_remove(dentry, false);
}
|
|
||||||
|
|
||||||
/* .rmdir: remove a directory entry */
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
	return ovl_do_remove(dentry, true);
}
|
|
||||||
|
|
||||||
/*
 * .rename2: rename within the overlay, supporting RENAME_EXCHANGE and
 * RENAME_NOREPLACE.
 *
 * Merged/lower directories cannot be renamed (would require copying up
 * a whole tree), so those return -EXDEV and userspace falls back to
 * copy+delete.  Both parents (and, for exchange, the target) are
 * copied up first.  When whiteouts or opaque markers are involved, the
 * actual rename is done with RENAME_WHITEOUT/RENAME_EXCHANGE under
 * temporarily raised capabilities, and opaque xattrs are added or
 * removed to keep lower entries correctly hidden.
 */
static int ovl_rename2(struct inode *olddir, struct dentry *old,
		       struct inode *newdir, struct dentry *new,
		       unsigned int flags)
{
	int err;
	enum ovl_path_type old_type;
	enum ovl_path_type new_type;
	struct dentry *old_upperdir;
	struct dentry *new_upperdir;
	struct dentry *olddentry;
	struct dentry *newdentry;
	struct dentry *trap;
	bool old_opaque;
	bool new_opaque;
	bool new_create = false;	/* NOTE(review): set but not read below */
	bool cleanup_whiteout = false;
	bool overwrite = !(flags & RENAME_EXCHANGE);
	bool is_dir = S_ISDIR(old->d_inode->i_mode);
	bool new_is_dir = false;
	struct dentry *opaquedir = NULL;
	const struct cred *old_cred = NULL;
	struct cred *override_cred = NULL;

	err = -EINVAL;
	if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
		goto out;

	/* NOREPLACE was already enforced by the VFS lookup; drop it */
	flags &= ~RENAME_NOREPLACE;

	err = ovl_check_sticky(old);
	if (err)
		goto out;

	/* Don't copy up directory trees */
	old_type = ovl_path_type(old);
	err = -EXDEV;
	if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
		goto out;

	if (new->d_inode) {
		err = ovl_check_sticky(new);
		if (err)
			goto out;

		if (S_ISDIR(new->d_inode->i_mode))
			new_is_dir = true;

		new_type = ovl_path_type(new);
		err = -EXDEV;
		if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
			goto out;

		/* Renaming an object onto itself is a no-op */
		err = 0;
		if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
			if (ovl_dentry_lower(old)->d_inode ==
			    ovl_dentry_lower(new)->d_inode)
				goto out;
		}
		if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
			if (ovl_dentry_upper(old)->d_inode ==
			    ovl_dentry_upper(new)->d_inode)
				goto out;
		}
	} else {
		/* Negative target: type depends on whether it is whiteouted */
		if (ovl_dentry_is_opaque(new))
			new_type = __OVL_PATH_UPPER;
		else
			new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
	}

	err = ovl_want_write(old);
	if (err)
		goto out;

	err = ovl_copy_up(old);
	if (err)
		goto out_drop_write;

	err = ovl_copy_up(new->d_parent);
	if (err)
		goto out_drop_write;
	if (!overwrite) {
		err = ovl_copy_up(new);
		if (err)
			goto out_drop_write;
	}

	old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
	new_opaque = !OVL_TYPE_PURE_UPPER(new_type);

	if (old_opaque || new_opaque) {
		err = -ENOMEM;
		override_cred = prepare_creds();
		if (!override_cred)
			goto out_drop_write;

		/*
		 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
		 * CAP_DAC_OVERRIDE for create in workdir
		 * CAP_FOWNER for removing whiteout from sticky dir
		 * CAP_FSETID for chmod of opaque dir
		 * CAP_CHOWN for chown of opaque dir
		 */
		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(override_cred->cap_effective, CAP_FOWNER);
		cap_raise(override_cred->cap_effective, CAP_FSETID);
		cap_raise(override_cred->cap_effective, CAP_CHOWN);
		old_cred = override_creds(override_cred);
	}

	/* Overwriting a merged dir: swap in an empty opaque dir first */
	if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
		opaquedir = ovl_check_empty_and_clear(new);
		err = PTR_ERR(opaquedir);
		if (IS_ERR(opaquedir)) {
			opaquedir = NULL;
			goto out_revert_creds;
		}
	}

	if (overwrite) {
		if (old_opaque) {
			if (new->d_inode || !new_opaque) {
				/* Whiteout source */
				flags |= RENAME_WHITEOUT;
			} else {
				/* Switch whiteouts */
				flags |= RENAME_EXCHANGE;
			}
		} else if (is_dir && !new->d_inode && new_opaque) {
			flags |= RENAME_EXCHANGE;
			cleanup_whiteout = true;
		}
	}

	old_upperdir = ovl_dentry_upper(old->d_parent);
	new_upperdir = ovl_dentry_upper(new->d_parent);

	trap = lock_rename(new_upperdir, old_upperdir);


	olddentry = lookup_one_len(old->d_name.name, old_upperdir,
				   old->d_name.len);
	err = PTR_ERR(olddentry);
	if (IS_ERR(olddentry))
		goto out_unlock;

	/* Re-validate source under lock */
	err = -ESTALE;
	if (olddentry != ovl_dentry_upper(old))
		goto out_dput_old;

	newdentry = lookup_one_len(new->d_name.name, new_upperdir,
				   new->d_name.len);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out_dput_old;

	/* Re-validate target under lock */
	err = -ESTALE;
	if (ovl_dentry_upper(new)) {
		if (opaquedir) {
			if (newdentry != opaquedir)
				goto out_dput;
		} else {
			if (newdentry != ovl_dentry_upper(new))
				goto out_dput;
		}
	} else {
		new_create = true;
		if (!d_is_negative(newdentry) &&
		    (!new_opaque || !ovl_is_whiteout(newdentry)))
			goto out_dput;
	}

	/* Would create a loop (ancestor renamed into descendant)? */
	if (olddentry == trap)
		goto out_dput;
	if (newdentry == trap)
		goto out_dput;

	if (is_dir && !old_opaque && new_opaque) {
		err = ovl_set_opaque(olddentry);
		if (err)
			goto out_dput;
	}
	if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
		err = ovl_set_opaque(newdentry);
		if (err)
			goto out_dput;
	}

	if (old_opaque || new_opaque) {
		err = ovl_do_rename(old_upperdir->d_inode, olddentry,
				    new_upperdir->d_inode, newdentry,
				    flags);
	} else {
		/* No debug for the plain case */
		BUG_ON(flags & ~RENAME_EXCHANGE);
		err = vfs_rename(old_upperdir->d_inode, olddentry,
				 new_upperdir->d_inode, newdentry,
				 NULL, flags);
	}

	if (err) {
		/* Undo the opaque markers added just above */
		if (is_dir && !old_opaque && new_opaque)
			ovl_remove_opaque(olddentry);
		if (!overwrite && new_is_dir && old_opaque && !new_opaque)
			ovl_remove_opaque(newdentry);
		goto out_dput;
	}

	/* Drop markers that became unnecessary after the rename */
	if (is_dir && old_opaque && !new_opaque)
		ovl_remove_opaque(olddentry);
	if (!overwrite && new_is_dir && !old_opaque && new_opaque)
		ovl_remove_opaque(newdentry);

	if (old_opaque != new_opaque) {
		ovl_dentry_set_opaque(old, new_opaque);
		if (!overwrite)
			ovl_dentry_set_opaque(new, old_opaque);
	}

	/* Exchanged whiteout now lives at the old location; remove it */
	if (cleanup_whiteout)
		ovl_cleanup(old_upperdir->d_inode, newdentry);

	ovl_dentry_version_inc(old->d_parent);
	ovl_dentry_version_inc(new->d_parent);

out_dput:
	dput(newdentry);
out_dput_old:
	dput(olddentry);
out_unlock:
	unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
	if (old_opaque || new_opaque) {
		revert_creds(old_cred);
		put_cred(override_cred);
	}
out_drop_write:
	ovl_drop_write(old);
out:
	dput(opaquedir);
	return err;
}
|
|
||||||
|
|
||||||
/* Plain .rename: delegate to ovl_rename2() with no flags */
static int ovl_rename(struct inode *olddir, struct dentry *old,
		      struct inode *newdir, struct dentry *new)
{
	return ovl_rename2(olddir, old, newdir, new, 0);
}
|
|
||||||
|
|
||||||
/*
 * Directory inode operations.  Uses the inode_operations_wrapper so the
 * extended ->rename2 entry point (RENAME_EXCHANGE etc.) can be exposed
 * on kernels whose struct inode_operations lacks it.
 */
const struct inode_operations_wrapper ovl_dir_inode_operations = {
	.ops = {
		.lookup = ovl_lookup,
		.mkdir = ovl_mkdir,
		.symlink = ovl_symlink,
		.unlink = ovl_unlink,
		.rmdir = ovl_rmdir,
		.rename = ovl_rename,
		.link = ovl_link,
		.setattr = ovl_setattr,
		.create = ovl_create,
		.mknod = ovl_mknod,
		.permission = ovl_permission,
		.getattr = ovl_dir_getattr,
		.setxattr = ovl_setxattr,
		.getxattr = ovl_getxattr,
		.listxattr = ovl_listxattr,
		.removexattr = ovl_removexattr,
	},
	.rename2 = ovl_rename2,
};
|
|
||||||
@@ -1,442 +0,0 @@
|
|||||||
/*
|
|
||||||
*
|
|
||||||
* Copyright (C) 2011 Novell Inc.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 as published by
|
|
||||||
* the Free Software Foundation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <linux/fs.h>
|
|
||||||
#include <linux/slab.h>
|
|
||||||
#include <linux/xattr.h>
|
|
||||||
#include "overlayfs.h"
|
|
||||||
|
|
||||||
/*
 * Copy up @dentry itself; its parent chain is copied up first.  With
 * @no_data the file is copied up with size 0 (an O_TRUNC open does not
 * need the lower data).  @attr, if non-NULL, is applied to the new
 * upper inode.  Returns 0 or a negative errno.
 */
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
			    bool no_data)
{
	int err;
	struct dentry *parent;
	struct kstat stat;
	struct path lowerpath;

	/* Parent must exist on the upper layer before the child can */
	parent = dget_parent(dentry);
	err = ovl_copy_up(parent);
	if (err)
		goto out_dput_parent;

	ovl_path_lower(dentry, &lowerpath);
	err = vfs_getattr(&lowerpath, &stat);
	if (err)
		goto out_dput_parent;

	/* Truncate-on-open: skip copying the file contents */
	if (no_data)
		stat.size = 0;

	err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);

out_dput_parent:
	dput(parent);
	return err;
}
|
|
||||||
|
|
||||||
/*
 * ->setattr(): attribute changes always land on the upper layer, so
 * copy the object up first and then apply the change to the upper
 * dentry.  Returns 0 or a negative errno.
 */
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
	int err;
	struct dentry *upperdentry;

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	err = ovl_copy_up(dentry);
	if (!err) {
		upperdentry = ovl_dentry_upper(dentry);

		/* notify_change() requires i_mutex held on the target */
		mutex_lock(&upperdentry->d_inode->i_mutex);
		err = notify_change(upperdentry, attr, NULL);
		mutex_unlock(&upperdentry->d_inode->i_mutex);
	}
	ovl_drop_write(dentry);
out:
	return err;
}
|
|
||||||
|
|
||||||
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
|
||||||
struct kstat *stat)
|
|
||||||
{
|
|
||||||
struct path realpath;
|
|
||||||
|
|
||||||
ovl_path_real(dentry, &realpath);
|
|
||||||
return vfs_getattr(&realpath, stat);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * ->permission(): check access against the real (upper or lower)
 * inode, with special handling for RCU-walk mode and for writes to
 * read-only real filesystems.  Returns 0 or a negative errno
 * (-ECHILD asks the VFS to retry in ref-walk mode).
 */
int ovl_permission(struct inode *inode, int mask)
{
	struct ovl_entry *oe;
	struct dentry *alias = NULL;
	struct inode *realinode;
	struct dentry *realdentry;
	bool is_upper;
	int err;

	if (S_ISDIR(inode->i_mode)) {
		/* Directories stash their ovl_entry in i_private */
		oe = inode->i_private;
	} else if (mask & MAY_NOT_BLOCK) {
		/* d_find_any_alias() may block; punt to ref-walk */
		return -ECHILD;
	} else {
		/*
		 * For non-directories find an alias and get the info
		 * from there.
		 */
		alias = d_find_any_alias(inode);
		if (WARN_ON(!alias))
			return -ENOENT;

		oe = alias->d_fsdata;
	}

	realdentry = ovl_entry_real(oe, &is_upper);

	/* Careful in RCU walk mode */
	realinode = ACCESS_ONCE(realdentry->d_inode);
	if (!realinode) {
		/* Only expected during a concurrent teardown in RCU walk */
		WARN_ON(!(mask & MAY_NOT_BLOCK));
		err = -ENOENT;
		goto out_dput;
	}

	if (mask & MAY_WRITE) {
		umode_t mode = realinode->i_mode;

		/*
		 * Writes will always be redirected to upper layer, so
		 * ignore lower layer being read-only.
		 *
		 * If the overlay itself is read-only then proceed
		 * with the permission check, don't return EROFS.
		 * This will only happen if this is the lower layer of
		 * another overlayfs.
		 *
		 * If upper fs becomes read-only after the overlay was
		 * constructed return EROFS to prevent modification of
		 * upper layer.
		 */
		err = -EROFS;
		if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
			goto out_dput;
	}

	err = __inode_permission(realinode, mask);
out_dput:
	/* alias is NULL for directories; dput(NULL) is a no-op */
	dput(alias);
	return err;
}
|
|
||||||
|
|
||||||
|
|
||||||
/*
 * Cookie wrapper passed between ->follow_link() and ->put_link() so the
 * real filesystem's own cookie can be handed back to its put_link().
 */
struct ovl_link_data {
	struct dentry *realdentry;	/* real dentry the link was read from */
	void *cookie;			/* real fs' follow_link() return value */
};
|
|
||||||
|
|
||||||
/*
 * ->follow_link(): forward to the real inode's follow_link().  If the
 * real fs also has a put_link() we must wrap its cookie in an
 * ovl_link_data so ovl_put_link() can forward it later; otherwise we
 * return NULL-equivalent state (data stays NULL).
 */
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	void *ret;
	struct dentry *realdentry;
	struct inode *realinode;
	struct ovl_link_data *data = NULL;

	realdentry = ovl_dentry_real(dentry);
	realinode = realdentry->d_inode;

	if (WARN_ON(!realinode->i_op->follow_link))
		return ERR_PTR(-EPERM);

	/* Only allocate a wrapper if put_link() will need the cookie */
	if (realinode->i_op->put_link) {
		data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
		if (!data)
			return ERR_PTR(-ENOMEM);
		data->realdentry = realdentry;
	}

	ret = realinode->i_op->follow_link(realdentry, nd);
	if (IS_ERR(ret)) {
		kfree(data);
		return ret;
	}

	if (data)
		data->cookie = ret;

	return data;
}
|
|
||||||
|
|
||||||
/*
 * ->put_link(): unwrap the ovl_link_data created by ovl_follow_link()
 * and forward the real cookie to the real fs' put_link().  @c is NULL
 * when the real fs had no put_link().
 */
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
{
	struct inode *realinode;
	struct ovl_link_data *data = c;

	if (!data)
		return;

	realinode = data->realdentry->d_inode;
	realinode->i_op->put_link(data->realdentry, nd, data->cookie);
	kfree(data);
}
|
|
||||||
|
|
||||||
/*
 * ->readlink(): forward to the real inode's readlink(), updating the
 * real path's atime as the VFS would.
 */
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
	struct path realpath;
	struct inode *realinode;

	ovl_path_real(dentry, &realpath);
	realinode = realpath.dentry->d_inode;

	if (!realinode->i_op->readlink)
		return -EINVAL;

	touch_atime(&realpath);

	return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
}
|
|
||||||
|
|
||||||
|
|
||||||
static bool ovl_is_private_xattr(const char *name)
|
|
||||||
{
|
|
||||||
return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * ->setxattr(): reject attempts to set overlay-private xattrs, then
 * copy the object up and set the attribute on the upper dentry.
 * Returns 0 or a negative errno.
 */
int ovl_setxattr(struct dentry *dentry, const char *name,
		 const void *value, size_t size, int flags)
{
	int err;
	struct dentry *upperdentry;

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	/* Userspace must not touch "trusted.overlay.*" bookkeeping */
	err = -EPERM;
	if (ovl_is_private_xattr(name))
		goto out_drop_write;

	err = ovl_copy_up(dentry);
	if (err)
		goto out_drop_write;

	upperdentry = ovl_dentry_upper(dentry);
	err = vfs_setxattr(upperdentry, name, value, size, flags);

out_drop_write:
	ovl_drop_write(dentry);
out:
	return err;
}
|
|
||||||
|
|
||||||
static bool ovl_need_xattr_filter(struct dentry *dentry,
|
|
||||||
enum ovl_path_type type)
|
|
||||||
{
|
|
||||||
if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
|
|
||||||
return S_ISDIR(dentry->d_inode->i_mode);
|
|
||||||
else
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * ->getxattr(): read from the real inode, but hide overlay-private
 * xattrs where they could leak through (merged upper directories).
 */
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
		     void *value, size_t size)
{
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);

	if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
		return -ENODATA;

	return vfs_getxattr(realpath.dentry, name, value, size);
}
|
|
||||||
|
|
||||||
/*
 * ->listxattr(): list the real inode's xattrs, then strip the
 * overlay-private names in place from the returned name buffer.
 * Returns the (possibly reduced) byte count or a negative errno.
 */
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
	ssize_t res;
	int off;

	res = vfs_listxattr(realpath.dentry, list, size);
	/* size == 0 is a pure size probe; nothing to filter */
	if (res <= 0 || size == 0)
		return res;

	if (!ovl_need_xattr_filter(dentry, type))
		return res;

	/* filter out private xattrs */
	for (off = 0; off < res;) {
		char *s = list + off;
		size_t slen = strlen(s) + 1;	/* include the NUL separator */

		BUG_ON(off + slen > res);

		if (ovl_is_private_xattr(s)) {
			/* Close the gap; off stays, next name moved here */
			res -= slen;
			memmove(s, s + slen, res - off);
		} else {
			off += slen;
		}
	}

	return res;
}
|
|
||||||
|
|
||||||
/*
 * ->removexattr(): remove an xattr from the upper object, copying up
 * first when the object is still on the lower layer.  Private overlay
 * xattrs are reported as absent.  Returns 0 or a negative errno.
 */
int ovl_removexattr(struct dentry *dentry, const char *name)
{
	int err;
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	err = -ENODATA;
	if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
		goto out_drop_write;

	if (!OVL_TYPE_UPPER(type)) {
		/*
		 * Probe the lower inode first: don't copy up just to
		 * fail with ENODATA afterwards.
		 */
		err = vfs_getxattr(realpath.dentry, name, NULL, 0);
		if (err < 0)
			goto out_drop_write;

		err = ovl_copy_up(dentry);
		if (err)
			goto out_drop_write;

		/* Re-resolve: removal must act on the new upper dentry */
		ovl_path_upper(dentry, &realpath);
	}

	err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
	ovl_drop_write(dentry);
out:
	return err;
}
|
|
||||||
|
|
||||||
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
|
|
||||||
struct dentry *realdentry)
|
|
||||||
{
|
|
||||||
if (OVL_TYPE_UPPER(type))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
if (special_file(realdentry->d_inode->i_mode))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * ->dentry_open(): open the real file, copying the object up first if
 * the open could modify it.  O_TRUNC opens copy up metadata only.  The
 * nocopyupw override suppresses copy-up entirely.  Returns 0 or a
 * negative errno.
 */
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
			   const struct cred *cred)
{
	int err;
	struct path realpath;
	enum ovl_path_type type;
	/* Tracks whether ovl_want_write() succeeded, for the unwind path */
	bool want_write = false;

	type = ovl_path_real(dentry, &realpath);
	if (!ovl_is_nocopyupw(dentry)) {
		if (ovl_open_need_copy_up(file->f_flags, type,
					  realpath.dentry)) {
			want_write = true;
			err = ovl_want_write(dentry);
			if (err)
				goto out;

			if (file->f_flags & O_TRUNC)
				/* Data will be discarded; skip copying it */
				err = ovl_copy_up_last(dentry, NULL, true);
			else
				err = ovl_copy_up(dentry);
			if (err)
				goto out_drop_write;

			/* Open the freshly created upper file */
			ovl_path_upper(dentry, &realpath);
		}
	}

	err = vfs_open(&realpath, file, cred);
out_drop_write:
	if (want_write)
		ovl_drop_write(dentry);
out:
	return err;
}
|
|
||||||
|
|
||||||
/*
 * Non-directory inode operations.  Wrapped so the out-of-tree
 * ->dentry_open entry point (used for copy-up-on-open) is available.
 */
static const struct inode_operations_wrapper ovl_file_inode_operations = {
	.ops = {
		.setattr = ovl_setattr,
		.permission = ovl_permission,
		.getattr = ovl_getattr,
		.setxattr = ovl_setxattr,
		.getxattr = ovl_getxattr,
		.listxattr = ovl_listxattr,
		.removexattr = ovl_removexattr,
	},
	.dentry_open = ovl_dentry_open,
};
|
|
||||||
|
|
||||||
/* Symlink inode operations: plain struct, no wrapper entry points needed. */
static const struct inode_operations ovl_symlink_inode_operations = {
	.setattr = ovl_setattr,
	.follow_link = ovl_follow_link,
	.put_link = ovl_put_link,
	.readlink = ovl_readlink,
	.getattr = ovl_getattr,
	.setxattr = ovl_setxattr,
	.getxattr = ovl_getxattr,
	.listxattr = ovl_listxattr,
	.removexattr = ovl_removexattr,
};
|
|
||||||
|
|
||||||
/*
 * Allocate and initialise a new overlay inode for an object of type
 * @mode (only the S_IFMT bits are used).  For directories @oe is
 * stashed in i_private so ovl_permission() can reach the overlay entry
 * without a dentry.  Returns NULL on allocation failure or an unknown
 * file type.
 */
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
			    struct ovl_entry *oe)
{
	struct inode *inode;

	inode = new_inode(sb);
	if (!inode)
		return NULL;

	mode &= S_IFMT;

	inode->i_ino = get_next_ino();
	inode->i_mode = mode;
	/* Timestamps are always taken from the real inode */
	inode->i_flags |= S_NOATIME | S_NOCMTIME;

	switch (mode) {
	case S_IFDIR:
		inode->i_private = oe;
		inode->i_op = &ovl_dir_inode_operations.ops;
		inode->i_fop = &ovl_dir_operations;
		/* Flag tells the VFS the ops struct is the wrapper kind */
		inode->i_flags |= S_IOPS_WRAPPER;
		break;

	case S_IFLNK:
		inode->i_op = &ovl_symlink_inode_operations;
		break;

	case S_IFREG:
	case S_IFSOCK:
	case S_IFBLK:
	case S_IFCHR:
	case S_IFIFO:
		inode->i_op = &ovl_file_inode_operations.ops;
		inode->i_flags |= S_IOPS_WRAPPER;
		break;

	default:
		WARN(1, "illegal file type: %i\n", mode);
		iput(inode);
		inode = NULL;
	}

	return inode;
}
|
|
||||||
@@ -1,200 +0,0 @@
|
|||||||
/*
|
|
||||||
*
|
|
||||||
* Copyright (C) 2011 Novell Inc.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 as published by
|
|
||||||
* the Free Software Foundation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <linux/kernel.h>
|
|
||||||
|
|
||||||
struct ovl_entry;
|
|
||||||
|
|
||||||
/*
 * Bitmask describing where a dentry's backing object(s) live:
 *   PURE  - upper object with nothing underneath it
 *   UPPER - an upper-layer object exists
 *   MERGE - directory merged from upper and lower layers
 */
enum ovl_path_type {
	__OVL_PATH_PURE = (1 << 0),
	__OVL_PATH_UPPER = (1 << 1),
	__OVL_PATH_MERGE = (1 << 2),
};

#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
#define OVL_TYPE_MERGE_OR_LOWER(type) \
	(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))

/* Prefix shared by all overlayfs bookkeeping xattrs */
#define OVL_XATTR_PRE_NAME "trusted.overlay."
/* strlen(OVL_XATTR_PRE_NAME) */
#define OVL_XATTR_PRE_LEN 16
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
|
|
||||||
|
|
||||||
/* vfs_rmdir() wrapper that logs the result for debugging. */
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
	int err = vfs_rmdir(dir, dentry);
	pr_debug("rmdir(%pd2) = %i\n", dentry, err);
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_unlink() wrapper (no delegated inode) that logs the result. */
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
	int err = vfs_unlink(dir, dentry, NULL);
	pr_debug("unlink(%pd2) = %i\n", dentry, err);
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_link() wrapper; logs the result only when @debug is set. */
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
			      struct dentry *new_dentry, bool debug)
{
	int err = vfs_link(old_dentry, dir, new_dentry, NULL);
	if (debug) {
		pr_debug("link(%pd2, %pd2) = %i\n",
			 old_dentry, new_dentry, err);
	}
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_create() wrapper; logs the result only when @debug is set. */
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
				umode_t mode, bool debug)
{
	int err = vfs_create(dir, dentry, mode, true);
	if (debug)
		pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_mkdir() wrapper; logs the result only when @debug is set. */
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
			       umode_t mode, bool debug)
{
	int err = vfs_mkdir(dir, dentry, mode);
	if (debug)
		pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_mknod() wrapper; logs mode and device only when @debug is set. */
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
			       umode_t mode, dev_t dev, bool debug)
{
	int err = vfs_mknod(dir, dentry, mode, dev);
	if (debug) {
		pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
			 dentry, mode, dev, err);
	}
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_symlink() wrapper; logs the result only when @debug is set. */
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
				 const char *oldname, bool debug)
{
	int err = vfs_symlink(dir, dentry, oldname);
	if (debug)
		pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
	return err;
}
|
|
||||||
|
|
||||||
/*
 * vfs_setxattr() wrapper that logs the call for debugging.
 *
 * The value is printed with "%.*s" (precision) rather than "%*s"
 * (field width): xattr values are not required to be NUL-terminated,
 * and a plain %s conversion would read past @size looking for one.
 * Precision caps the read at @size bytes.
 */
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
				  const void *value, size_t size, int flags)
{
	int err = vfs_setxattr(dentry, name, value, size, flags);
	pr_debug("setxattr(%pd2, \"%s\", \"%.*s\", 0x%x) = %i\n",
		 dentry, name, (int) size, (char *) value, flags, err);
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_removexattr() wrapper that logs the result for debugging. */
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
	int err = vfs_removexattr(dentry, name);
	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
	return err;
}
|
|
||||||
|
|
||||||
/*
 * vfs_rename() wrapper that logs the call before and (on failure)
 * after, since a rename can leave the tree in an unexpected state.
 */
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
				struct inode *newdir, struct dentry *newdentry,
				unsigned int flags)
{
	int err;

	/* Log before the call: the dentries may be moved by it */
	pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
		 olddentry, newdentry, flags);

	err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);

	if (err) {
		pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
			 olddentry, newdentry, err);
	}
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_whiteout() wrapper that logs the result for debugging. */
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
	int err = vfs_whiteout(dir, dentry);
	pr_debug("whiteout(%pd2) = %i\n", dentry, err);
	return err;
}
|
|
||||||
|
|
||||||
bool ovl_is_nocopyupw(struct dentry *dentry);
|
|
||||||
enum ovl_path_type ovl_path_type(struct dentry *dentry);
|
|
||||||
u64 ovl_dentry_version_get(struct dentry *dentry);
|
|
||||||
void ovl_dentry_version_inc(struct dentry *dentry);
|
|
||||||
void ovl_path_upper(struct dentry *dentry, struct path *path);
|
|
||||||
void ovl_path_lower(struct dentry *dentry, struct path *path);
|
|
||||||
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
|
|
||||||
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
|
|
||||||
struct dentry *ovl_dentry_upper(struct dentry *dentry);
|
|
||||||
struct dentry *ovl_dentry_lower(struct dentry *dentry);
|
|
||||||
struct dentry *ovl_dentry_real(struct dentry *dentry);
|
|
||||||
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
|
|
||||||
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
|
|
||||||
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
|
|
||||||
struct dentry *ovl_workdir(struct dentry *dentry);
|
|
||||||
int ovl_want_write(struct dentry *dentry);
|
|
||||||
void ovl_drop_write(struct dentry *dentry);
|
|
||||||
bool ovl_dentry_is_opaque(struct dentry *dentry);
|
|
||||||
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
|
|
||||||
bool ovl_is_whiteout(struct dentry *dentry);
|
|
||||||
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
|
|
||||||
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
|
|
||||||
unsigned int flags);
|
|
||||||
struct file *ovl_path_open(struct path *path, int flags);
|
|
||||||
|
|
||||||
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
|
|
||||||
struct kstat *stat, const char *link);
|
|
||||||
|
|
||||||
/* readdir.c */
|
|
||||||
extern const struct file_operations ovl_dir_operations;
|
|
||||||
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
|
|
||||||
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
|
|
||||||
void ovl_cache_free(struct list_head *list);
|
|
||||||
|
|
||||||
/* inode.c */
|
|
||||||
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
|
|
||||||
int ovl_permission(struct inode *inode, int mask);
|
|
||||||
int ovl_setxattr(struct dentry *dentry, const char *name,
|
|
||||||
const void *value, size_t size, int flags);
|
|
||||||
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
|
|
||||||
void *value, size_t size);
|
|
||||||
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
|
|
||||||
int ovl_removexattr(struct dentry *dentry, const char *name);
|
|
||||||
|
|
||||||
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
|
|
||||||
struct ovl_entry *oe);
|
|
||||||
/* Propagate ownership (uid/gid only) from the real inode to @to. */
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
	to->i_uid = from->i_uid;
	to->i_gid = from->i_gid;
}
|
|
||||||
|
|
||||||
/* dir.c */
|
|
||||||
extern const struct inode_operations_wrapper ovl_dir_inode_operations;
|
|
||||||
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
|
|
||||||
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
|
|
||||||
struct kstat *stat, const char *link,
|
|
||||||
struct dentry *hardlink, bool debug);
|
|
||||||
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
|
|
||||||
|
|
||||||
/* copy_up.c */
|
|
||||||
int ovl_copy_up(struct dentry *dentry);
|
|
||||||
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
|
|
||||||
struct path *lowerpath, struct kstat *stat,
|
|
||||||
struct iattr *attr);
|
|
||||||
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
|
|
||||||
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
|
|
||||||
@@ -1,626 +0,0 @@
|
|||||||
/*
|
|
||||||
*
|
|
||||||
* Copyright (C) 2011 Novell Inc.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 as published by
|
|
||||||
* the Free Software Foundation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <linux/fs.h>
|
|
||||||
#include <linux/slab.h>
|
|
||||||
#include <linux/namei.h>
|
|
||||||
#include <linux/file.h>
|
|
||||||
#include <linux/xattr.h>
|
|
||||||
#include <linux/rbtree.h>
|
|
||||||
#include <linux/security.h>
|
|
||||||
#include <linux/cred.h>
|
|
||||||
#include <linux/version.h>
|
|
||||||
#include "overlayfs.h"
|
|
||||||
|
|
||||||
/* One directory entry in the merged-readdir cache. */
struct ovl_cache_entry {
	unsigned int len;		/* name length, excluding NUL */
	unsigned int type;		/* d_type of the entry */
	u64 ino;			/* inode number reported to readdir */
	struct list_head l_node;	/* position in the merged list */
	struct rb_node node;		/* lookup by name during merging */
	/* chain of DT_CHR entries whose whiteout status is still unknown */
	struct ovl_cache_entry *next_maybe_whiteout;
	bool is_whiteout;		/* entry is a whiteout; hide it */
	char name[];			/* NUL-terminated entry name */
};
|
|
||||||
|
|
||||||
/* Shared, refcounted cache of a merged directory's entries. */
struct ovl_dir_cache {
	long refcount;			/* open dir files using this cache */
	u64 version;			/* dentry version it was built from */
	struct list_head entries;	/* list of ovl_cache_entry */
};
|
|
||||||
|
|
||||||
/* vfs_readdir vs. iterate_dir compat */
/*
 * iterate_dir() replaced vfs_readdir() in 3.11 (and in RHEL >= 7.5
 * backports).  On older kernels there is no struct dir_context, so
 * declare a minimal stand-in that just carries the filldir callback.
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 11, 0) || \
	(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 5))
#define USE_ITERATE_DIR 1
#endif

#ifndef USE_ITERATE_DIR
struct dir_context {
	const filldir_t actor;
	//loff_t pos;
};
#endif
|
|
||||||
|
|
||||||
/* State threaded through the filldir callbacks while reading one layer. */
struct ovl_readdir_data {
	struct dir_context ctx;		/* must be first: container_of() */
	bool is_merge;			/* merging a lower layer into list */
	struct rb_root root;		/* names seen so far, for dedup */
	struct list_head *list;		/* destination entry list */
	struct list_head middle;	/* staging list for lower entries */
	/* DT_CHR entries pending a whiteout check */
	struct ovl_cache_entry *first_maybe_whiteout;
	int count;			/* entries seen in this batch */
	int err;			/* first error from the callback */
};
|
|
||||||
|
|
||||||
/* Per-open-file state for an overlay directory. */
struct ovl_dir_file {
	bool is_real;			/* serve directly from one real dir */
	bool is_upper;			/* realfile is the upper directory */
	struct ovl_dir_cache *cache;	/* merged entries (shared, refcounted) */
	struct list_head *cursor;	/* current position in cache list */
	struct file *realfile;		/* backing directory file */
	struct file *upperfile;		/* upper dir, opened lazily if needed */
};
|
|
||||||
|
|
||||||
/* Map an rb-tree node back to its containing cache entry. */
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
	return container_of(n, struct ovl_cache_entry, node);
}
|
|
||||||
|
|
||||||
/*
 * Look up @name (of length @len, not NUL-terminated) in the name tree.
 * Returns the matching entry or NULL.
 */
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
						    const char *name, int len)
{
	struct rb_node *node = root->rb_node;
	int cmp;

	while (node) {
		struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);

		/*
		 * Compare only @len bytes; the extra "len < p->len"
		 * test distinguishes a stored name that merely has
		 * @name as a prefix.
		 */
		cmp = strncmp(name, p->name, len);
		if (cmp > 0)
			node = p->node.rb_right;
		else if (cmp < 0 || len < p->len)
			node = p->node.rb_left;
		else
			return p;
	}

	return NULL;
}
|
|
||||||
|
|
||||||
/*
 * Allocate a cache entry for @name.  DT_CHR entries might be whiteout
 * device nodes, so they are chained onto rdd->first_maybe_whiteout for
 * a later lookup-based check.  Returns NULL on allocation failure.
 */
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
						   const char *name, int len,
						   u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *p;
	/* Flexible array member: name plus its NUL terminator */
	size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);

	p = kmalloc(size, GFP_KERNEL);
	if (!p)
		return NULL;

	memcpy(p->name, name, len);
	p->name[len] = '\0';
	p->len = len;
	p->type = d_type;
	p->ino = ino;
	p->is_whiteout = false;

	if (d_type == DT_CHR) {
		/* Might be a whiteout; queue for ovl_check_whiteouts() */
		p->next_maybe_whiteout = rdd->first_maybe_whiteout;
		rdd->first_maybe_whiteout = p;
	}
	return p;
}
|
|
||||||
|
|
||||||
/*
 * Insert @name into the rb-tree and append it to the entry list,
 * unless an entry with the same name already exists (higher layers
 * shadow lower ones).  Returns 0 or -ENOMEM.
 */
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
				  const char *name, int len, u64 ino,
				  unsigned int d_type)
{
	struct rb_node **newp = &rdd->root.rb_node;
	struct rb_node *parent = NULL;
	struct ovl_cache_entry *p;

	/* Standard rb-tree descent; same compare as ovl_cache_entry_find() */
	while (*newp) {
		int cmp;
		struct ovl_cache_entry *tmp;

		parent = *newp;
		tmp = ovl_cache_entry_from_node(*newp);
		cmp = strncmp(name, tmp->name, len);
		if (cmp > 0)
			newp = &tmp->node.rb_right;
		else if (cmp < 0 || len < tmp->len)
			newp = &tmp->node.rb_left;
		else
			return 0;	/* duplicate: already listed */
	}

	p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
	if (p == NULL)
		return -ENOMEM;

	list_add_tail(&p->l_node, rdd->list);
	rb_link_node(&p->node, parent, newp);
	rb_insert_color(&p->node, &rdd->root);

	return 0;
}
|
|
||||||
|
|
||||||
/*
 * Merge-pass handler for a lower-layer entry: if an upper entry with
 * the same name exists, move it into the "middle" staging list
 * (preserving lower-layer ordering); otherwise create a fresh entry
 * there.  Returns 0 or a negative errno (also stored in rdd->err).
 */
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
			  const char *name, int namelen,
			  loff_t offset, u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *p;

	p = ovl_cache_entry_find(&rdd->root, name, namelen);
	if (p) {
		/* Shadowed by upper: keep the upper entry, reposition it */
		list_move_tail(&p->l_node, &rdd->middle);
	} else {
		p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
		if (p == NULL)
			rdd->err = -ENOMEM;
		else
			list_add_tail(&p->l_node, &rdd->middle);
	}

	return rdd->err;
}
|
|
||||||
|
|
||||||
/* Free every cache entry on @list and reinitialise the list head. */
void ovl_cache_free(struct list_head *list)
{
	struct ovl_cache_entry *p;
	struct ovl_cache_entry *n;

	list_for_each_entry_safe(p, n, list, l_node)
		kfree(p);

	INIT_LIST_HEAD(list);
}
|
|
||||||
|
|
||||||
/*
 * Drop one reference on the dir file's cache; on the last reference
 * detach it from the dentry (if still attached) and free it.
 */
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
	struct ovl_dir_cache *cache = od->cache;

	WARN_ON(cache->refcount <= 0);
	cache->refcount--;
	if (!cache->refcount) {
		/* The dentry may already point at a newer cache */
		if (ovl_dir_cache(dentry) == cache)
			ovl_set_dir_cache(dentry, NULL);

		ovl_cache_free(&cache->entries);
		kfree(cache);
	}
}
|
|
||||||
|
|
||||||
static int ovl_fill_merge(void *buf, const char *name, int namelen,
|
|
||||||
loff_t offset, u64 ino, unsigned int d_type)
|
|
||||||
{
|
|
||||||
struct dir_context *ctx = buf;
|
|
||||||
struct ovl_readdir_data *rdd =
|
|
||||||
container_of(ctx, struct ovl_readdir_data, ctx);
|
|
||||||
|
|
||||||
rdd->count++;
|
|
||||||
if (!rdd->is_merge)
|
|
||||||
return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
|
|
||||||
else
|
|
||||||
return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Resolve the pending "maybe whiteout" DT_CHR entries queued by
 * ovl_cache_entry_new(): look each one up under @dir and record whether
 * it really is a whiteout.  Lookups are done with CAP_DAC_OVERRIDE
 * raised since the reader may lack search permission on the real dir.
 * Returns 0, -ENOMEM, or -EINTR if the lock wait was killed.
 */
static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
{
	int err;
	struct ovl_cache_entry *p;
	struct dentry *dentry;
	const struct cred *old_cred;
	struct cred *override_cred;

	override_cred = prepare_creds();
	if (!override_cred)
		return -ENOMEM;

	/*
	 * CAP_DAC_OVERRIDE for lookup
	 */
	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
	old_cred = override_creds(override_cred);

	err = mutex_lock_killable(&dir->d_inode->i_mutex);
	if (!err) {
		while (rdd->first_maybe_whiteout) {
			p = rdd->first_maybe_whiteout;
			rdd->first_maybe_whiteout = p->next_maybe_whiteout;
			dentry = lookup_one_len(p->name, dir, p->len);
			if (!IS_ERR(dentry)) {
				p->is_whiteout = ovl_is_whiteout(dentry);
				dput(dentry);
			}
			/* Lookup errors are ignored: entry stays visible */
		}
		mutex_unlock(&dir->d_inode->i_mutex);
	}
	revert_creds(old_cred);
	put_cred(override_cred);

	return err;
}
|
|
||||||
|
|
||||||
/*
 * Read all entries of one real directory into @rdd, then resolve any
 * queued whiteout candidates.  Loops until a pass yields no entries,
 * since a single iterate/readdir call may return a partial batch.
 * Returns 0 or a negative errno.
 */
static inline int ovl_dir_read(struct path *realpath,
			       struct ovl_readdir_data *rdd)
{
	struct file *realfile;
	int err;

	realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
	if (IS_ERR(realfile))
		return PTR_ERR(realfile);

	rdd->first_maybe_whiteout = NULL;
	//rdd->ctx.pos = 0;
	do {
		rdd->count = 0;
		rdd->err = 0;
#ifdef USE_ITERATE_DIR
		err = iterate_dir(realfile, &rdd->ctx);
#else
		/* Pre-3.11 compat: actor gets rdd directly as its buffer */
		err = vfs_readdir(realfile, rdd->ctx.actor, rdd);
#endif
		/* Callback errors (rdd->err) take precedence over success */
		if (err >= 0)
			err = rdd->err;
	} while (!err && rdd->count);

	if (!err && rdd->first_maybe_whiteout)
		err = ovl_check_whiteouts(realpath->dentry, rdd);

	fput(realfile);

	return err;
}
|
|
||||||
|
|
||||||
/*
 * Called when the file position is rewound to 0: drop a stale entry
 * cache (the directory changed since the cache was built) and refresh
 * od->is_real for a directory that has become a merge dir (e.g. after
 * copy up) since it was opened.
 */
static void ovl_dir_reset(struct file *file)
{
	struct ovl_dir_file *od = file->private_data;
	struct ovl_dir_cache *cache = od->cache;
	struct dentry *dentry = file->f_path.dentry;
	enum ovl_path_type type = ovl_path_type(dentry);

	/* cache is invalid if the directory version has moved on */
	if (cache && ovl_dentry_version_get(dentry) != cache->version) {
		ovl_cache_put(od, dentry);
		od->cache = NULL;
		od->cursor = NULL;
	}
	/*
	 * NOTE(review): warns if a dir that was merge at open time is no
	 * longer merge — presumably that transition should not happen.
	 */
	WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
	if (od->is_real && OVL_TYPE_MERGE(type))
		od->is_real = false;
}
|
|
||||||
|
|
||||||
/*
 * Build the merged entry list for directory @dentry by reading every
 * layer, from the topmost down (ovl_path_next() walks them; -1 marks
 * the end).  All layers but the last are read in non-merge mode; the
 * lowest layer is read in merge mode with entries inserted before the
 * "middle" marker.  Returns 0 or a negative errno.
 */
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
	int err;
	struct path realpath;
	struct ovl_readdir_data rdd = {
		.ctx.actor = ovl_fill_merge,
		.list = list,
		.root = RB_ROOT,
		.is_merge = false,
	};
	int idx, next;

	/* the loop body always runs at least once, so err is always set */
	for (idx = 0; idx != -1; idx = next) {
		next = ovl_path_next(idx, dentry, &realpath);

		if (next != -1) {
			err = ovl_dir_read(&realpath, &rdd);
			if (err)
				break;
		} else {
			/*
			 * Insert lowest layer entries before upper ones, this
			 * allows offsets to be reasonably constant
			 */
			list_add(&rdd.middle, rdd.list);
			rdd.is_merge = true;
			err = ovl_dir_read(&realpath, &rdd);
			list_del(&rdd.middle);
		}
	}
	return err;
}
|
|
||||||
|
|
||||||
/*
 * Position the cache cursor at entry number @pos of the merged entry
 * list (or at the list head if @pos is past the end).
 */
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
	struct list_head *head = &od->cache->entries;
	struct list_head *iter = head->next;
	loff_t idx;

	/* advance at most @pos steps, stopping at the list head */
	for (idx = 0; idx < pos && iter != head; idx++)
		iter = iter->next;

	/* Cursor is safe since the cache is stable */
	od->cursor = iter;
}
|
|
||||||
|
|
||||||
/*
 * Get the merged entry cache for @dentry, taking a reference on it.
 * If a cached copy exists and its version still matches the dentry it
 * is reused; otherwise a fresh cache is built with
 * ovl_dir_read_merged() and attached to the dentry.
 *
 * Returns the cache or an ERR_PTR() on failure.
 */
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
	int res;
	struct ovl_dir_cache *cache;

	cache = ovl_dir_cache(dentry);
	if (cache && ovl_dentry_version_get(dentry) == cache->version) {
		cache->refcount++;
		return cache;
	}
	/* stale or absent: detach before rebuilding */
	ovl_set_dir_cache(dentry, NULL);

	cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
	if (!cache)
		return ERR_PTR(-ENOMEM);

	cache->refcount = 1;
	INIT_LIST_HEAD(&cache->entries);

	res = ovl_dir_read_merged(dentry, &cache->entries);
	if (res) {
		ovl_cache_free(&cache->entries);
		kfree(cache);
		return ERR_PTR(res);
	}

	/* record the version the cache was built against */
	cache->version = ovl_dentry_version_get(dentry);
	ovl_set_dir_cache(dentry, cache);

	return cache;
}
|
|
||||||
|
|
||||||
#ifdef USE_ITERATE_DIR
|
|
||||||
struct iterate_wrapper {
|
|
||||||
struct dir_context ctx;
|
|
||||||
filldir_t actor;
|
|
||||||
void *buf;
|
|
||||||
};
|
|
||||||
|
|
||||||
static int ovl_wrap_readdir(void *ctx, const char *name, int namelen,
|
|
||||||
loff_t offset, u64 ino, unsigned int d_type)
|
|
||||||
{
|
|
||||||
struct iterate_wrapper *w = ctx;
|
|
||||||
|
|
||||||
return w->actor(w->buf, name, namelen, offset, ino, d_type);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*
 * readdir for overlay directories.
 *
 * Non-merge ("real") directories are forwarded straight to the
 * underlying file.  Merge directories are served from the merged entry
 * cache, skipping whiteouts; for them file->f_pos counts cache entries
 * rather than mirroring a real directory offset.
 */
static int ovl_readdir(struct file *file, void *buf, filldir_t filler)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct ovl_cache_entry *p;
	int res;

	/* rewinding to 0 revalidates cache and is_real */
	if (!file->f_pos)
		ovl_dir_reset(file);

	if (od->is_real) {
#ifdef USE_ITERATE_DIR
		struct iterate_wrapper w = {
			.ctx.actor = ovl_wrap_readdir,
			.actor = filler,
			.buf = buf,
		};
		res = iterate_dir(od->realfile, &w.ctx);
#else
		res = vfs_readdir(od->realfile, filler, buf);
#endif
		/* keep the overlay position in sync with the real file */
		file->f_pos = od->realfile->f_pos;

		return res;
	}

	if (!od->cache) {
		struct ovl_dir_cache *cache;

		cache = ovl_cache_get(dentry);
		if (IS_ERR(cache))
			return PTR_ERR(cache);

		od->cache = cache;
		ovl_seek_cursor(od, file->f_pos);
	}

	/* emit entries until the buffer is full or the list ends */
	while (od->cursor != &od->cache->entries) {
		p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
		if (!p->is_whiteout)
			if (filler(buf, p->name, p->len, file->f_pos, p->ino, p->type))
				break;
		od->cursor = p->l_node.next;
		file->f_pos++;
	}
	return 0;
}
|
|
||||||
|
|
||||||
/*
 * llseek for overlay directories.  Real directories delegate to
 * vfs_llseek() on the underlying file.  Merge directories support only
 * SEEK_SET and SEEK_CUR; on a position change the cache cursor (if a
 * cache exists) is repositioned.  Serialized by the inode mutex.
 */
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
	loff_t res;
	struct ovl_dir_file *od = file->private_data;

	mutex_lock(&file_inode(file)->i_mutex);
	if (!file->f_pos)
		ovl_dir_reset(file);

	if (od->is_real) {
		res = vfs_llseek(od->realfile, offset, origin);
		file->f_pos = od->realfile->f_pos;
	} else {
		res = -EINVAL;

		switch (origin) {
		case SEEK_CUR:
			offset += file->f_pos;
			break;
		case SEEK_SET:
			break;
		default:
			/* SEEK_END etc. not supported on merge dirs */
			goto out_unlock;
		}
		if (offset < 0)
			goto out_unlock;

		if (offset != file->f_pos) {
			file->f_pos = offset;
			if (od->cache)
				ovl_seek_cursor(od, offset);
		}
		res = offset;
	}
out_unlock:
	mutex_unlock(&file_inode(file)->i_mutex);

	return res;
}
|
|
||||||
|
|
||||||
/*
 * fsync for overlay directories.
 *
 * If the directory was lower at open time but has since been copied up,
 * the upper directory must be synced instead of the originally opened
 * one.  The upper file is opened lazily and cached in od->upperfile;
 * the inode mutex arbitrates between racing openers so only one
 * reference is kept.
 */
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
			 int datasync)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct file *realfile = od->realfile;

	/*
	 * Need to check if we started out being a lower dir, but got copied up
	 */
	if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
		struct inode *inode = file_inode(file);

		/* unlocked read; re-checked under the mutex below */
		realfile = lockless_dereference(od->upperfile);
		if (!realfile) {
			struct path upperpath;

			ovl_path_upper(dentry, &upperpath);
			realfile = ovl_path_open(&upperpath, O_RDONLY);
			/*
			 * NOTE(review): barrier placement predates the
			 * mutex_lock below — presumably ordering the open
			 * against publication of od->upperfile; confirm
			 * against the matching lockless_dereference().
			 */
			smp_mb__before_spinlock();
			mutex_lock(&inode->i_mutex);
			if (!od->upperfile) {
				if (IS_ERR(realfile)) {
					mutex_unlock(&inode->i_mutex);
					return PTR_ERR(realfile);
				}
				od->upperfile = realfile;
			} else {
				/* somebody has beaten us to it */
				if (!IS_ERR(realfile))
					fput(realfile);
				realfile = od->upperfile;
			}
			mutex_unlock(&inode->i_mutex);
		}
	}

	return vfs_fsync_range(realfile, start, end, datasync);
}
|
|
||||||
|
|
||||||
/*
 * Release an overlay directory file: drop the cache reference (under
 * the inode mutex) and release the real and, if opened, upper files.
 */
static int ovl_dir_release(struct inode *inode, struct file *file)
{
	struct ovl_dir_file *od = file->private_data;

	if (od->cache) {
		mutex_lock(&inode->i_mutex);
		ovl_cache_put(od, file->f_path.dentry);
		mutex_unlock(&inode->i_mutex);
	}
	fput(od->realfile);
	if (od->upperfile)
		fput(od->upperfile);
	kfree(od);

	return 0;
}
|
|
||||||
|
|
||||||
/*
 * Open an overlay directory: open the topmost real underlying
 * directory and record whether the dentry is a merge dir (is_real) and
 * whether the opened layer is the upper one (is_upper, used by fsync).
 */
static int ovl_dir_open(struct inode *inode, struct file *file)
{
	struct path realpath;
	struct file *realfile;
	struct ovl_dir_file *od;
	enum ovl_path_type type;

	od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
	if (!od)
		return -ENOMEM;

	type = ovl_path_real(file->f_path.dentry, &realpath);
	realfile = ovl_path_open(&realpath, file->f_flags);
	if (IS_ERR(realfile)) {
		kfree(od);
		return PTR_ERR(realfile);
	}
	od->realfile = realfile;
	od->is_real = !OVL_TYPE_MERGE(type);
	od->is_upper = OVL_TYPE_UPPER(type);
	file->private_data = od;

	return 0;
}
|
|
||||||
|
|
||||||
/* File operations for overlay directories */
const struct file_operations ovl_dir_operations = {
	.read		= generic_read_dir,
	.open		= ovl_dir_open,
	.readdir	= ovl_readdir,
	.llseek		= ovl_dir_llseek,
	.fsync		= ovl_dir_fsync,
	.release	= ovl_dir_release,
};
|
|
||||||
|
|
||||||
/*
 * Check that the merged directory contains nothing except ".", ".."
 * and whiteouts.  The merged entry list is returned in @list so the
 * caller can later remove the whiteouts (see ovl_cleanup_whiteouts()).
 *
 * Returns 0 if empty, -ENOTEMPTY otherwise, or the read error.
 */
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
	int err;
	struct ovl_cache_entry *p;

	err = ovl_dir_read_merged(dentry, list);
	if (err)
		return err;

	err = 0;

	list_for_each_entry(p, list, l_node) {
		if (p->is_whiteout)
			continue;

		/* skip "." and ".." */
		if (p->name[0] == '.') {
			if (p->len == 1)
				continue;
			if (p->len == 2 && p->name[1] == '.')
				continue;
		}
		err = -ENOTEMPTY;
		break;
	}

	return err;
}
|
|
||||||
|
|
||||||
/*
 * Remove all whiteout entries listed in @list from the upper directory
 * @upper.  Best effort: lookup failures are logged and skipped.
 * Takes the upper directory's inode mutex (child class).
 */
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
	struct ovl_cache_entry *p;

	mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
	list_for_each_entry(p, list, l_node) {
		struct dentry *dentry;

		if (!p->is_whiteout)
			continue;

		dentry = lookup_one_len(p->name, upper, p->len);
		if (IS_ERR(dentry)) {
			pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
			       upper->d_name.name, p->len, p->name,
			       (int) PTR_ERR(dentry));
			continue;
		}
		ovl_cleanup(upper->d_inode, dentry);
		dput(dentry);
	}
	mutex_unlock(&upper->d_inode->i_mutex);
}
|
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -1,416 +0,0 @@
|
|||||||
/*
|
|
||||||
*
|
|
||||||
* Copyright (C) 2011 Novell Inc.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 as published by
|
|
||||||
* the Free Software Foundation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <linux/fs.h>
|
|
||||||
#include <linux/slab.h>
|
|
||||||
#include <linux/file.h>
|
|
||||||
#include <linux/splice.h>
|
|
||||||
#include <linux/xattr.h>
|
|
||||||
#include <linux/security.h>
|
|
||||||
#include <linux/uaccess.h>
|
|
||||||
#include <linux/sched.h>
|
|
||||||
#include <linux/namei.h>
|
|
||||||
#include "overlayfs.h"
|
|
||||||
|
|
||||||
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
|
|
||||||
|
|
||||||
/*
 * Copy all extended attributes from @old to @new.
 *
 * Returns 0 if both dentries support xattrs and every attribute was
 * copied (also 0 when listing is unsupported on @old), otherwise a
 * negative errno.  Values larger than XATTR_SIZE_MAX fail the copy.
 */
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
	ssize_t list_size, size;
	char *buf, *name, *value;
	int error;

	/* nothing to do if either side has no xattr support */
	if (!old->d_inode->i_op->getxattr ||
	    !new->d_inode->i_op->getxattr)
		return 0;

	/* first pass: get the size of the name list */
	list_size = vfs_listxattr(old, NULL, 0);
	if (list_size <= 0) {
		if (list_size == -EOPNOTSUPP)
			return 0;
		return list_size;
	}

	buf = kzalloc(list_size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	error = -ENOMEM;
	value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
	if (!value)
		goto out;

	list_size = vfs_listxattr(old, buf, list_size);
	if (list_size <= 0) {
		error = list_size;
		goto out_free_value;
	}

	/* the list is a sequence of NUL-terminated names */
	for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
		size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
		if (size <= 0) {
			error = size;
			goto out_free_value;
		}
		error = vfs_setxattr(new, name, value, size, 0);
		if (error)
			goto out_free_value;
	}

out_free_value:
	kfree(value);
out:
	kfree(buf);
	return error;
}
|
|
||||||
|
|
||||||
/*
 * Copy @len bytes of file data from @old to @new in
 * OVL_COPY_UP_CHUNK_SIZE chunks using do_splice_direct().
 * The copy is killable between chunks.  Returns 0 or a negative errno.
 */
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
	struct file *old_file;
	struct file *new_file;
	loff_t old_pos = 0;
	loff_t new_pos = 0;
	int error = 0;

	if (len == 0)
		return 0;

	old_file = ovl_path_open(old, O_RDONLY);
	if (IS_ERR(old_file))
		return PTR_ERR(old_file);

	new_file = ovl_path_open(new, O_WRONLY);
	if (IS_ERR(new_file)) {
		error = PTR_ERR(new_file);
		goto out_fput;
	}

	/* FIXME: copy up sparse files efficiently */
	while (len) {
		size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
		long bytes;

		if (len < this_len)
			this_len = len;

		/* allow a fatal signal to abort a long copy */
		if (signal_pending_state(TASK_KILLABLE, current)) {
			error = -EINTR;
			break;
		}

		bytes = do_splice_direct(old_file, &old_pos,
					 new_file, &new_pos,
					 this_len, SPLICE_F_MOVE);
		if (bytes <= 0) {
			error = bytes;
			break;
		}
		WARN_ON(old_pos != new_pos);

		len -= bytes;
	}

	fput(new_file);
out_fput:
	fput(old_file);
	return error;
}
|
|
||||||
|
|
||||||
/*
 * Read the target of the symlink @realdentry into a freshly allocated
 * page, NUL-terminated.  The caller must free the result with
 * free_page().  Returns the string or an ERR_PTR().
 */
static char *ovl_read_symlink(struct dentry *realdentry)
{
	int res;
	char *buf;
	struct inode *inode = realdentry->d_inode;
	mm_segment_t old_fs;

	res = -EINVAL;
	if (!inode->i_op->readlink)
		goto err;

	res = -ENOMEM;
	buf = (char *) __get_free_page(GFP_KERNEL);
	if (!buf)
		goto err;

	old_fs = get_fs();
	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	res = inode->i_op->readlink(realdentry,
				    (char __user *)buf, PAGE_SIZE - 1);
	set_fs(old_fs);
	if (res < 0) {
		free_page((unsigned long) buf);
		goto err;
	}
	buf[res] = '\0';

	return buf;

err:
	return ERR_PTR(res);
}
|
|
||||||
|
|
||||||
/*
 * Set atime and mtime of @upperdentry from @stat via notify_change().
 * Returns 0 or a negative errno.
 */
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
	struct iattr attr = {
		.ia_valid =
		     ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
		.ia_atime = stat->atime,
		.ia_mtime = stat->mtime,
	};

	return notify_change(upperdentry, &attr, NULL);
}
|
|
||||||
|
|
||||||
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
|
|
||||||
{
|
|
||||||
int err = 0;
|
|
||||||
|
|
||||||
if (!S_ISLNK(stat->mode)) {
|
|
||||||
struct iattr attr = {
|
|
||||||
.ia_valid = ATTR_MODE,
|
|
||||||
.ia_mode = stat->mode,
|
|
||||||
};
|
|
||||||
err = notify_change(upperdentry, &attr, NULL);
|
|
||||||
}
|
|
||||||
if (!err) {
|
|
||||||
struct iattr attr = {
|
|
||||||
.ia_valid = ATTR_UID | ATTR_GID,
|
|
||||||
.ia_uid = stat->uid,
|
|
||||||
.ia_gid = stat->gid,
|
|
||||||
};
|
|
||||||
err = notify_change(upperdentry, &attr, NULL);
|
|
||||||
}
|
|
||||||
if (!err)
|
|
||||||
ovl_set_timestamps(upperdentry, stat);
|
|
||||||
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Do the actual copy up of one dentry, with workdir and upperdir
 * rename-locked by the caller:
 *
 *   1. create a temp object in workdir with the right type,
 *   2. copy file data (regular files) and xattrs into it,
 *   3. apply attributes (and optionally @attr) to it,
 *   4. atomically rename it into place in upperdir.
 *
 * On any failure after creation the temp object is cleaned up out of
 * the workdir.  Returns 0 or a negative errno.
 */
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
			      struct dentry *dentry, struct path *lowerpath,
			      struct kstat *stat, struct iattr *attr,
			      const char *link)
{
	struct inode *wdir = workdir->d_inode;
	struct inode *udir = upperdir->d_inode;
	struct dentry *newdentry = NULL;
	struct dentry *upper = NULL;
	umode_t mode = stat->mode;
	int err;

	newdentry = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out1;

	/* Can't properly set mode on creation because of the umask */
	stat->mode &= S_IFMT;
	err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
	stat->mode = mode;
	if (err)
		goto out2;

	if (S_ISREG(stat->mode)) {
		struct path upperpath;
		ovl_path_upper(dentry, &upperpath);
		/* dentry must not have an upper yet: we are creating it */
		BUG_ON(upperpath.dentry != NULL);
		upperpath.dentry = newdentry;

		err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
		if (err)
			goto out_cleanup;
	}

	err = ovl_copy_xattr(lowerpath->dentry, newdentry);
	if (err)
		goto out_cleanup;

	mutex_lock(&newdentry->d_inode->i_mutex);
	err = ovl_set_attr(newdentry, stat);
	if (!err && attr)
		err = notify_change(newdentry, attr, NULL);
	mutex_unlock(&newdentry->d_inode->i_mutex);
	if (err)
		goto out_cleanup;

	/* move the fully built temp object into place */
	err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
	if (err)
		goto out_cleanup;

	ovl_dentry_update(dentry, newdentry);
	newdentry = NULL;

	/*
	 * Non-directores become opaque when copied up.
	 */
	if (!S_ISDIR(stat->mode))
		ovl_dentry_set_opaque(dentry, true);
out2:
	dput(upper);
out1:
	dput(newdentry);
out:
	return err;

out_cleanup:
	ovl_cleanup(wdir, newdentry);
	goto out;
}
|
|
||||||
|
|
||||||
/*
 * Copy up a single dentry
 *
 * Directory renames only allowed on "pure upper" (already created on
 * upper filesystem, never copied up). Directories which are on lower or
 * are merged may not be renamed. For these -EXDEV is returned and
 * userspace has to deal with it. This means, when copying up a
 * directory we can rely on it and ancestors being stable.
 *
 * Non-directory renames start with copy up of source if necessary. The
 * actual rename will only proceed once the copy up was successful. Copy
 * up uses upper parent i_mutex for exclusion. Since rename can change
 * d_parent it is possible that the copy up will lock the old parent. At
 * that point the file will have already been copied up anyway.
 */
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
		    struct path *lowerpath, struct kstat *stat,
		    struct iattr *attr)
{
	struct dentry *workdir = ovl_workdir(dentry);
	int err;
	struct kstat pstat;
	struct path parentpath;
	struct dentry *upperdir;
	struct dentry *upperdentry;
	const struct cred *old_cred;
	struct cred *override_cred;
	char *link = NULL;

	/* no workdir means the overlay is effectively read-only */
	if (WARN_ON(!workdir))
		return -EROFS;

	ovl_path_upper(parent, &parentpath);
	upperdir = parentpath.dentry;

	/* parent timestamps are saved so they can be restored afterwards */
	err = vfs_getattr(&parentpath, &pstat);
	if (err)
		return err;

	if (S_ISLNK(stat->mode)) {
		link = ovl_read_symlink(lowerpath->dentry);
		if (IS_ERR(link))
			return PTR_ERR(link);
	}

	err = -ENOMEM;
	override_cred = prepare_creds();
	if (!override_cred)
		goto out_free_link;

	/* create the copy as the original owner */
	override_cred->fsuid = stat->uid;
	override_cred->fsgid = stat->gid;
	/*
	 * CAP_SYS_ADMIN for copying up extended attributes
	 * CAP_DAC_OVERRIDE for create
	 * CAP_FOWNER for chmod, timestamp update
	 * CAP_FSETID for chmod
	 * CAP_CHOWN for chown
	 * CAP_MKNOD for mknod
	 */
	cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
	cap_raise(override_cred->cap_effective, CAP_FOWNER);
	cap_raise(override_cred->cap_effective, CAP_FSETID);
	cap_raise(override_cred->cap_effective, CAP_CHOWN);
	cap_raise(override_cred->cap_effective, CAP_MKNOD);
	old_cred = override_creds(override_cred);

	err = -EIO;
	if (lock_rename(workdir, upperdir) != NULL) {
		pr_err("overlayfs: failed to lock workdir+upperdir\n");
		goto out_unlock;
	}
	upperdentry = ovl_dentry_upper(dentry);
	if (upperdentry) {
		unlock_rename(workdir, upperdir);
		err = 0;
		/* Raced with another copy-up?  Do the setattr here */
		if (attr) {
			mutex_lock(&upperdentry->d_inode->i_mutex);
			err = notify_change(upperdentry, attr, NULL);
			mutex_unlock(&upperdentry->d_inode->i_mutex);
		}
		goto out_put_cred;
	}

	err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
				 stat, attr, link);
	if (!err) {
		/* Restore timestamps on parent (best effort) */
		ovl_set_timestamps(upperdir, &pstat);
	}
out_unlock:
	unlock_rename(workdir, upperdir);
out_put_cred:
	revert_creds(old_cred);
	put_cred(override_cred);

out_free_link:
	if (link)
		free_page((unsigned long) link);

	return err;
}
|
|
||||||
|
|
||||||
/*
 * Copy up @dentry and all its not-yet-copied-up ancestors, topmost
 * first, one per outer-loop iteration until @dentry itself is on the
 * upper layer.  Returns 0 or the first copy up error.
 */
int ovl_copy_up(struct dentry *dentry)
{
	int err;

	err = 0;
	while (!err) {
		struct dentry *next;
		struct dentry *parent;
		struct path lowerpath;
		struct kstat stat;
		enum ovl_path_type type = ovl_path_type(dentry);

		/* done once the dentry itself is upper */
		if (OVL_TYPE_UPPER(type))
			break;

		next = dget(dentry);
		/* find the topmost dentry not yet copied up */
		for (;;) {
			parent = dget_parent(next);

			type = ovl_path_type(parent);
			if (OVL_TYPE_UPPER(type))
				break;

			dput(next);
			next = parent;
		}

		ovl_path_lower(next, &lowerpath);
		err = vfs_getattr(&lowerpath, &stat);
		if (!err)
			err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);

		dput(parent);
		dput(next);
	}

	return err;
}
|
|
||||||
@@ -1,951 +0,0 @@
|
|||||||
/*
|
|
||||||
*
|
|
||||||
* Copyright (C) 2011 Novell Inc.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 as published by
|
|
||||||
* the Free Software Foundation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <linux/fs.h>
|
|
||||||
#include <linux/namei.h>
|
|
||||||
#include <linux/xattr.h>
|
|
||||||
#include <linux/security.h>
|
|
||||||
#include <linux/cred.h>
|
|
||||||
#include "overlayfs.h"
|
|
||||||
|
|
||||||
/*
 * Remove @wdentry (rmdir for directories, unlink otherwise) from the
 * directory @wdir.  Best effort: a failure is only logged.
 */
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
	int err;

	/* hold a reference across the removal */
	dget(wdentry);
	if (d_is_dir(wdentry))
		err = ovl_do_rmdir(wdir, wdentry);
	else
		err = ovl_do_unlink(wdir, wdentry);
	dput(wdentry);

	if (err) {
		pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
		       wdentry, err);
	}
}
|
|
||||||
|
|
||||||
/*
 * Look up a temporary-name negative dentry in the workdir for building
 * objects before renaming them into place.  The name is derived from
 * the dentry pointer, so it is unique while @dentry exists; a leftover
 * positive entry is treated as corruption (-EIO).
 *
 * Returns a negative dentry or an ERR_PTR().
 */
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
	struct dentry *temp;
	char name[20];

	snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);

	temp = lookup_one_len(name, workdir, strlen(name));
	if (!IS_ERR(temp) && temp->d_inode) {
		pr_err("overlayfs: workdir/%s already exists\n", name);
		dput(temp);
		temp = ERR_PTR(-EIO);
	}

	return temp;
}
|
|
||||||
|
|
||||||
/* caller holds i_mutex on workdir */
|
|
||||||
static struct dentry *ovl_whiteout(struct dentry *workdir,
|
|
||||||
struct dentry *dentry)
|
|
||||||
{
|
|
||||||
int err;
|
|
||||||
struct dentry *whiteout;
|
|
||||||
struct inode *wdir = workdir->d_inode;
|
|
||||||
|
|
||||||
whiteout = ovl_lookup_temp(workdir, dentry);
|
|
||||||
if (IS_ERR(whiteout))
|
|
||||||
return whiteout;
|
|
||||||
|
|
||||||
err = ovl_do_whiteout(wdir, whiteout);
|
|
||||||
if (err) {
|
|
||||||
dput(whiteout);
|
|
||||||
whiteout = ERR_PTR(err);
|
|
||||||
}
|
|
||||||
|
|
||||||
return whiteout;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Instantiate the negative dentry @newdentry in @dir: as a hard link
 * to @hardlink if given, otherwise as a new object of the type encoded
 * in stat->mode (@link supplies the symlink target).  @debug selects
 * the debug variants of the vfs helpers.
 *
 * Returns 0 on success; -ESTALE if @newdentry is already positive;
 * -EPERM for unsupported file types; -ENOENT if creation succeeded but
 * left the dentry uninstantiated; or the underlying vfs error.
 */
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
		    struct kstat *stat, const char *link,
		    struct dentry *hardlink, bool debug)
{
	int err;

	if (newdentry->d_inode)
		return -ESTALE;

	if (hardlink) {
		err = ovl_do_link(hardlink, dir, newdentry, debug);
	} else {
		switch (stat->mode & S_IFMT) {
		case S_IFREG:
			err = ovl_do_create(dir, newdentry, stat->mode, debug);
			break;

		case S_IFDIR:
			err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
			break;

		case S_IFCHR:
		case S_IFBLK:
		case S_IFIFO:
		case S_IFSOCK:
			err = ovl_do_mknod(dir, newdentry,
					   stat->mode, stat->rdev, debug);
			break;

		case S_IFLNK:
			err = ovl_do_symlink(dir, newdentry, link, debug);
			break;

		default:
			err = -EPERM;
		}
	}
	if (!err && WARN_ON(!newdentry->d_inode)) {
		/*
		 * Not quite sure if non-instantiated dentry is legal or not.
		 * VFS doesn't seem to care so check and warn here.
		 */
		err = -ENOENT;
	}
	return err;
}
|
|
||||||
|
|
||||||
/* Mark @upperdentry opaque (hides lower layer entries below it) */
static int ovl_set_opaque(struct dentry *upperdentry)
{
	return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
}
|
|
||||||
|
|
||||||
/*
 * Remove the opaque marker xattr from @upperdentry.  Best effort:
 * a failure is only logged.
 */
static void ovl_remove_opaque(struct dentry *upperdentry)
{
	int err;

	err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
	if (err) {
		pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
			upperdentry->d_name.name, err);
	}
}
|
|
||||||
|
|
||||||
/*
 * getattr for overlay directories: stat the topmost real directory,
 * then substitute the overlay's own device and inode numbers, and
 * force nlink to 1 for merge dirs (counting real subdirs across
 * layers is not worth it).
 */
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
			   struct kstat *stat)
{
	int err;
	enum ovl_path_type type;
	struct path realpath;

	type = ovl_path_real(dentry, &realpath);
	err = vfs_getattr(&realpath, stat);
	if (err)
		return err;

	/* report overlay identity, not the underlying layer's */
	stat->dev = dentry->d_sb->s_dev;
	stat->ino = dentry->d_inode->i_ino;

	/*
	 * It's probably not worth it to count subdirs to get the
	 * correct link count. nlink=1 seems to pacify 'find' and
	 * other utilities.
	 */
	if (OVL_TYPE_MERGE(type))
		stat->nlink = 1;

	return 0;
}
|
|
||||||
|
|
||||||
/*
 * Create an object directly in the upper parent directory (the parent
 * is already pure upper, so no workdir dance is needed), then hook the
 * new upper dentry up to the overlay dentry and instantiate @inode.
 *
 * Returns 0 or a negative errno.
 */
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
			    struct kstat *stat, const char *link,
			    struct dentry *hardlink)
{
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *newdentry;
	int err;

	mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
	newdentry = lookup_one_len(dentry->d_name.name, upperdir,
				   dentry->d_name.len);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out_unlock;
	err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
	if (err)
		goto out_dput;

	/* parent contents changed: bump version so caches revalidate */
	ovl_dentry_version_inc(dentry->d_parent);
	ovl_dentry_update(dentry, newdentry);
	ovl_copyattr(newdentry->d_inode, inode);
	d_instantiate(dentry, inode);
	newdentry = NULL;
out_dput:
	dput(newdentry);
out_unlock:
	mutex_unlock(&udir->i_mutex);
	return err;
}
|
|
||||||
|
|
||||||
/*
 * Rename-lock workdir and upperdir, verifying they are distinct and
 * that neither is an ancestor of the other (lock_rename() returns a
 * non-NULL ancestor in that case).  Returns 0 with both locked, or
 * -EIO with nothing held.
 */
static int ovl_lock_rename_workdir(struct dentry *workdir,
				   struct dentry *upperdir)
{
	/* Workdir should not be the same as upperdir */
	if (workdir == upperdir)
		goto err;

	/* Workdir should not be subdir of upperdir and vice versa */
	if (lock_rename(workdir, upperdir) != NULL)
		goto err_unlock;

	return 0;

err_unlock:
	unlock_rename(workdir, upperdir);
err:
	pr_err("overlayfs: failed to lock workdir+upperdir\n");
	return -EIO;
}
|
|
||||||
|
|
||||||
/*
 * Replace the upper directory of @dentry (a merge dir that is empty
 * apart from whiteouts, listed in @list) with a fresh opaque directory
 * of identical attributes:
 *
 *   1. build an opaque replacement dir in the workdir,
 *   2. exchange it with the old upper dir via RENAME_EXCHANGE,
 *   3. remove the whiteouts from, then delete, the old dir.
 *
 * The overlay dentry is d_drop()ed since its upper no longer matches.
 * Returns the new upper directory dentry or an ERR_PTR().
 */
static struct dentry *ovl_clear_empty(struct dentry *dentry,
				      struct list_head *list)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir = workdir->d_inode;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct path upperpath;
	struct dentry *upper;
	struct dentry *opaquedir;
	struct kstat stat;
	int err;

	if (WARN_ON(!workdir))
		return ERR_PTR(-EROFS);

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	ovl_path_upper(dentry, &upperpath);
	err = vfs_getattr(&upperpath, &stat);
	if (err)
		goto out_unlock;

	/* the upper must still be a dir in this parent (no race) */
	err = -ESTALE;
	if (!S_ISDIR(stat.mode))
		goto out_unlock;
	upper = upperpath.dentry;
	if (upper->d_parent->d_inode != udir)
		goto out_unlock;

	opaquedir = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(opaquedir);
	if (IS_ERR(opaquedir))
		goto out_unlock;

	err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
	if (err)
		goto out_dput;

	err = ovl_copy_xattr(upper, opaquedir);
	if (err)
		goto out_cleanup;

	err = ovl_set_opaque(opaquedir);
	if (err)
		goto out_cleanup;

	mutex_lock(&opaquedir->d_inode->i_mutex);
	err = ovl_set_attr(opaquedir, &stat);
	mutex_unlock(&opaquedir->d_inode->i_mutex);
	if (err)
		goto out_cleanup;

	/* swap the replacement into place atomically */
	err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
	if (err)
		goto out_cleanup;

	ovl_cleanup_whiteouts(upper, list);
	ovl_cleanup(wdir, upper);
	unlock_rename(workdir, upperdir);

	/* dentry's upper doesn't match now, get rid of it */
	d_drop(dentry);

	return opaquedir;

out_cleanup:
	ovl_cleanup(wdir, opaquedir);
out_dput:
	dput(opaquedir);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return ERR_PTR(err);
}
|
|
||||||
|
|
||||||
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
int err;
|
|
||||||
struct dentry *ret = NULL;
|
|
||||||
LIST_HEAD(list);
|
|
||||||
|
|
||||||
err = ovl_check_empty_dir(dentry, &list);
|
|
||||||
if (err)
|
|
||||||
ret = ERR_PTR(err);
|
|
||||||
else {
|
|
||||||
/*
|
|
||||||
* If no upperdentry then skip clearing whiteouts.
|
|
||||||
*
|
|
||||||
* Can race with copy-up, since we don't hold the upperdir
|
|
||||||
* mutex. Doesn't matter, since copy-up can't create a
|
|
||||||
* non-empty directory from an empty one.
|
|
||||||
*/
|
|
||||||
if (ovl_dentry_upper(dentry))
|
|
||||||
ret = ovl_clear_empty(dentry, &list);
|
|
||||||
}
|
|
||||||
|
|
||||||
ovl_cache_free(&list);
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
|
|
||||||
struct kstat *stat, const char *link,
|
|
||||||
struct dentry *hardlink)
|
|
||||||
{
|
|
||||||
struct dentry *workdir = ovl_workdir(dentry);
|
|
||||||
struct inode *wdir = workdir->d_inode;
|
|
||||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
|
||||||
struct inode *udir = upperdir->d_inode;
|
|
||||||
struct dentry *upper;
|
|
||||||
struct dentry *newdentry;
|
|
||||||
int err;
|
|
||||||
|
|
||||||
if (WARN_ON(!workdir))
|
|
||||||
return -EROFS;
|
|
||||||
|
|
||||||
err = ovl_lock_rename_workdir(workdir, upperdir);
|
|
||||||
if (err)
|
|
||||||
goto out;
|
|
||||||
|
|
||||||
newdentry = ovl_lookup_temp(workdir, dentry);
|
|
||||||
err = PTR_ERR(newdentry);
|
|
||||||
if (IS_ERR(newdentry))
|
|
||||||
goto out_unlock;
|
|
||||||
|
|
||||||
upper = lookup_one_len(dentry->d_name.name, upperdir,
|
|
||||||
dentry->d_name.len);
|
|
||||||
err = PTR_ERR(upper);
|
|
||||||
if (IS_ERR(upper))
|
|
||||||
goto out_dput;
|
|
||||||
|
|
||||||
err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
|
|
||||||
if (err)
|
|
||||||
goto out_dput2;
|
|
||||||
|
|
||||||
if (S_ISDIR(stat->mode)) {
|
|
||||||
err = ovl_set_opaque(newdentry);
|
|
||||||
if (err)
|
|
||||||
goto out_cleanup;
|
|
||||||
|
|
||||||
err = ovl_do_rename(wdir, newdentry, udir, upper,
|
|
||||||
RENAME_EXCHANGE);
|
|
||||||
if (err)
|
|
||||||
goto out_cleanup;
|
|
||||||
|
|
||||||
ovl_cleanup(wdir, upper);
|
|
||||||
} else {
|
|
||||||
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
|
|
||||||
if (err)
|
|
||||||
goto out_cleanup;
|
|
||||||
}
|
|
||||||
ovl_dentry_version_inc(dentry->d_parent);
|
|
||||||
ovl_dentry_update(dentry, newdentry);
|
|
||||||
ovl_copyattr(newdentry->d_inode, inode);
|
|
||||||
d_instantiate(dentry, inode);
|
|
||||||
newdentry = NULL;
|
|
||||||
out_dput2:
|
|
||||||
dput(upper);
|
|
||||||
out_dput:
|
|
||||||
dput(newdentry);
|
|
||||||
out_unlock:
|
|
||||||
unlock_rename(workdir, upperdir);
|
|
||||||
out:
|
|
||||||
return err;
|
|
||||||
|
|
||||||
out_cleanup:
|
|
||||||
ovl_cleanup(wdir, newdentry);
|
|
||||||
goto out_dput2;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Common helper for all object creation (create/mkdir/mknod/symlink/link).
 *
 * Allocates the overlay inode, copies up the parent, then either creates
 * directly in the upper dir or — when the dentry is opaque, i.e. covered
 * by a whiteout — creates via workdir with temporarily raised
 * capabilities.
 *
 * @link is the symlink target (or NULL); @hardlink is the upper dentry to
 * link to (or NULL).  Returns 0 or a negative errno.
 */
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
			      const char *link, struct dentry *hardlink)
{
	int err;
	struct inode *inode;
	struct kstat stat = {
		.mode = mode,
		.rdev = rdev,
	};

	err = -ENOMEM;
	inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
	if (!inode)
		goto out;

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_iput;

	if (!ovl_dentry_is_opaque(dentry)) {
		err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
	} else {
		const struct cred *old_cred;
		struct cred *override_cred;

		err = -ENOMEM;
		override_cred = prepare_creds();
		if (!override_cred)
			goto out_iput;

		/*
		 * CAP_SYS_ADMIN for setting opaque xattr
		 * CAP_DAC_OVERRIDE for create in workdir, rename
		 * CAP_FOWNER for removing whiteout from sticky dir
		 */
		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(override_cred->cap_effective, CAP_FOWNER);
		old_cred = override_creds(override_cred);

		err = ovl_create_over_whiteout(dentry, inode, &stat, link,
					       hardlink);

		revert_creds(old_cred);
		put_cred(override_cred);
	}

	/* On success the inode reference was consumed by d_instantiate() */
	if (!err)
		inode = NULL;
out_iput:
	iput(inode);
out:
	return err;
}
|
|
||||||
|
|
||||||
/*
 * Thin wrapper around ovl_create_or_link() for non-hardlink creation:
 * takes the write reference on the overlay for the duration of the call.
 */
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
			     const char *link)
{
	int err = ovl_want_write(dentry);

	if (err)
		return err;

	err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
	ovl_drop_write(dentry);

	return err;
}
|
|
||||||
|
|
||||||
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
|
|
||||||
bool excl)
|
|
||||||
{
|
|
||||||
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ->mkdir: make a directory with the requested permission bits. */
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	umode_t dirmode = S_IFDIR | (mode & 07777);

	return ovl_create_object(dentry, dirmode, 0, NULL);
}
|
|
||||||
|
|
||||||
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
|
|
||||||
dev_t rdev)
|
|
||||||
{
|
|
||||||
/* Don't allow creation of "whiteout" on overlay */
|
|
||||||
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
|
|
||||||
return -EPERM;
|
|
||||||
|
|
||||||
return ovl_create_object(dentry, mode, rdev, NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
|
|
||||||
const char *link)
|
|
||||||
{
|
|
||||||
return ovl_create_object(dentry, S_IFLNK, 0, link);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int ovl_link(struct dentry *old, struct inode *newdir,
|
|
||||||
struct dentry *new)
|
|
||||||
{
|
|
||||||
int err;
|
|
||||||
struct dentry *upper;
|
|
||||||
|
|
||||||
err = ovl_want_write(old);
|
|
||||||
if (err)
|
|
||||||
goto out;
|
|
||||||
|
|
||||||
err = ovl_copy_up(old);
|
|
||||||
if (err)
|
|
||||||
goto out_drop_write;
|
|
||||||
|
|
||||||
upper = ovl_dentry_upper(old);
|
|
||||||
err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
|
|
||||||
|
|
||||||
out_drop_write:
|
|
||||||
ovl_drop_write(old);
|
|
||||||
out:
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Remove an entry that is (at least partly) backed by a lower layer:
 * create a whiteout in workdir and rename it over the entry in upperdir
 * so the lower object stays hidden.  Directories are first checked for
 * emptiness (and merged dirs replaced by an opaque copy).
 *
 * Caller must hold capabilities for workdir create/rename (see
 * ovl_do_remove()).
 *
 * Fix: the original dereferenced workdir->d_inode in the initializer,
 * before the WARN_ON(!workdir) guard, so a missing workdir would oops
 * instead of returning -EROFS.  The dereference now happens after the
 * check.
 */
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *whiteout;
	struct dentry *upper;
	struct dentry *opaquedir = NULL;
	int err;

	if (WARN_ON(!workdir))
		return -EROFS;
	wdir = workdir->d_inode;

	if (is_dir) {
		if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
			opaquedir = ovl_check_empty_and_clear(dentry);
			err = PTR_ERR(opaquedir);
			if (IS_ERR(opaquedir))
				goto out;
		} else {
			LIST_HEAD(list);

			/*
			 * When removing an empty opaque directory, then it
			 * makes no sense to replace it with an exact replica of
			 * itself.  But emptiness still needs to be checked.
			 */
			err = ovl_check_empty_dir(dentry, &list);
			ovl_cache_free(&list);
			if (err)
				goto out;
		}
	}

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out_dput;

	whiteout = ovl_whiteout(workdir, dentry);
	err = PTR_ERR(whiteout);
	if (IS_ERR(whiteout))
		goto out_unlock;

	upper = ovl_dentry_upper(dentry);
	if (!upper) {
		/* Pure-lower entry: just drop the whiteout into place */
		upper = lookup_one_len(dentry->d_name.name, upperdir,
				       dentry->d_name.len);
		err = PTR_ERR(upper);
		if (IS_ERR(upper))
			goto kill_whiteout;

		err = ovl_do_rename(wdir, whiteout, udir, upper, 0);
		dput(upper);
		if (err)
			goto kill_whiteout;
	} else {
		int flags = 0;

		if (opaquedir)
			upper = opaquedir;
		err = -ESTALE;
		if (upper->d_parent != upperdir)
			goto kill_whiteout;

		/* Swap so the displaced dir can be cleaned up afterwards */
		if (is_dir)
			flags |= RENAME_EXCHANGE;

		err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
		if (err)
			goto kill_whiteout;

		if (is_dir)
			ovl_cleanup(wdir, upper);
	}
	ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
	d_drop(dentry);
	dput(whiteout);
out_unlock:
	unlock_rename(workdir, upperdir);
out_dput:
	dput(opaquedir);
out:
	return err;

kill_whiteout:
	ovl_cleanup(wdir, whiteout);
	goto out_d_drop;
}
|
|
||||||
|
|
||||||
/*
 * Remove a pure-upper entry: no whiteout is needed, simply rmdir/unlink
 * the object in the upper layer under the parent's i_mutex.
 *
 * Returns -ESTALE if the upper dentry no longer sits under upperdir.
 */
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *dir = upperdir->d_inode;
	struct dentry *upper = ovl_dentry_upper(dentry);
	int err;

	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
	err = -ESTALE;
	if (upper->d_parent == upperdir) {
		/* Don't let d_delete() think it can reset d_inode */
		dget(upper);
		if (is_dir)
			err = vfs_rmdir(dir, upper);
		else
			err = vfs_unlink(dir, upper, NULL);
		dput(upper);
		ovl_dentry_version_inc(dentry->d_parent);
	}

	/*
	 * Keeping this dentry hashed would mean having to release
	 * upperpath/lowerpath, which could only be done if we are the
	 * sole user of this dentry.  Too tricky...  Just unhash for
	 * now.
	 */
	d_drop(dentry);
	mutex_unlock(&dir->i_mutex);

	return err;
}
|
|
||||||
|
|
||||||
static inline int ovl_check_sticky(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
|
|
||||||
struct inode *inode = ovl_dentry_real(dentry)->d_inode;
|
|
||||||
|
|
||||||
if (check_sticky(dir, inode))
|
|
||||||
return -EPERM;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Common unlink/rmdir implementation.
 *
 * After the sticky check and parent copy-up, a pure-upper entry is
 * removed directly; anything with a lower layer is whited out under
 * temporarily raised capabilities.
 */
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
	enum ovl_path_type type;
	int err;

	err = ovl_check_sticky(dentry);
	if (err)
		goto out;

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_drop_write;

	type = ovl_path_type(dentry);
	if (OVL_TYPE_PURE_UPPER(type)) {
		err = ovl_remove_upper(dentry, is_dir);
	} else {
		const struct cred *old_cred;
		struct cred *override_cred;

		err = -ENOMEM;
		override_cred = prepare_creds();
		if (!override_cred)
			goto out_drop_write;

		/*
		 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
		 * CAP_DAC_OVERRIDE for create in workdir, rename
		 * CAP_FOWNER for removing whiteout from sticky dir
		 * CAP_FSETID for chmod of opaque dir
		 * CAP_CHOWN for chown of opaque dir
		 */
		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(override_cred->cap_effective, CAP_FOWNER);
		cap_raise(override_cred->cap_effective, CAP_FSETID);
		cap_raise(override_cred->cap_effective, CAP_CHOWN);
		old_cred = override_creds(override_cred);

		err = ovl_remove_and_whiteout(dentry, is_dir);

		revert_creds(old_cred);
		put_cred(override_cred);
	}
out_drop_write:
	ovl_drop_write(dentry);
out:
	return err;
}
|
|
||||||
|
|
||||||
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
|
|
||||||
{
|
|
||||||
return ovl_do_remove(dentry, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
|
|
||||||
{
|
|
||||||
return ovl_do_remove(dentry, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * ->rename2: rename within the overlay, handling RENAME_EXCHANGE and
 * RENAME_NOREPLACE (the latter is already enforced by the VFS lookup, so
 * the flag is stripped here).
 *
 * The heavy lifting is tracking "opaque" state: a source or target that
 * covers lower-layer content must leave a whiteout behind, swap
 * whiteouts, or gain/lose its opaque xattr so that lower entries do not
 * reappear after the rename.  Renaming a merged/lower directory is
 * refused with -EXDEV (no directory-tree copy-up).
 */
static int ovl_rename2(struct inode *olddir, struct dentry *old,
		       struct inode *newdir, struct dentry *new,
		       unsigned int flags)
{
	int err;
	enum ovl_path_type old_type;
	enum ovl_path_type new_type;
	struct dentry *old_upperdir;
	struct dentry *new_upperdir;
	struct dentry *olddentry;
	struct dentry *newdentry;
	struct dentry *trap;
	bool old_opaque;
	bool new_opaque;
	bool new_create = false;	/* NOTE(review): set below but unused here — TODO confirm */
	bool cleanup_whiteout = false;
	bool overwrite = !(flags & RENAME_EXCHANGE);
	bool is_dir = d_is_dir(old);
	bool new_is_dir = false;
	struct dentry *opaquedir = NULL;
	const struct cred *old_cred = NULL;
	struct cred *override_cred = NULL;

	err = -EINVAL;
	if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
		goto out;

	flags &= ~RENAME_NOREPLACE;

	err = ovl_check_sticky(old);
	if (err)
		goto out;

	/* Don't copy up directory trees */
	old_type = ovl_path_type(old);
	err = -EXDEV;
	if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
		goto out;

	if (new->d_inode) {
		err = ovl_check_sticky(new);
		if (err)
			goto out;

		if (d_is_dir(new))
			new_is_dir = true;

		new_type = ovl_path_type(new);
		err = -EXDEV;
		if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
			goto out;

		/* Renaming an object onto itself is a no-op */
		err = 0;
		if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
			if (ovl_dentry_lower(old)->d_inode ==
			    ovl_dentry_lower(new)->d_inode)
				goto out;
		}
		if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
			if (ovl_dentry_upper(old)->d_inode ==
			    ovl_dentry_upper(new)->d_inode)
				goto out;
		}
	} else {
		/* Negative target: opacity decided by the dentry flag */
		if (ovl_dentry_is_opaque(new))
			new_type = __OVL_PATH_UPPER;
		else
			new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
	}

	err = ovl_want_write(old);
	if (err)
		goto out;

	err = ovl_copy_up(old);
	if (err)
		goto out_drop_write;

	err = ovl_copy_up(new->d_parent);
	if (err)
		goto out_drop_write;
	if (!overwrite) {
		/* Exchange: both sides must exist in the upper layer */
		err = ovl_copy_up(new);
		if (err)
			goto out_drop_write;
	}

	old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
	new_opaque = !OVL_TYPE_PURE_UPPER(new_type);

	if (old_opaque || new_opaque) {
		err = -ENOMEM;
		override_cred = prepare_creds();
		if (!override_cred)
			goto out_drop_write;

		/*
		 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
		 * CAP_DAC_OVERRIDE for create in workdir
		 * CAP_FOWNER for removing whiteout from sticky dir
		 * CAP_FSETID for chmod of opaque dir
		 * CAP_CHOWN for chown of opaque dir
		 */
		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(override_cred->cap_effective, CAP_FOWNER);
		cap_raise(override_cred->cap_effective, CAP_FSETID);
		cap_raise(override_cred->cap_effective, CAP_CHOWN);
		old_cred = override_creds(override_cred);
	}

	/* Overwriting a merged/lower dir: replace it by an opaque copy first */
	if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
		opaquedir = ovl_check_empty_and_clear(new);
		err = PTR_ERR(opaquedir);
		if (IS_ERR(opaquedir)) {
			opaquedir = NULL;
			goto out_revert_creds;
		}
	}

	if (overwrite) {
		if (old_opaque) {
			if (new->d_inode || !new_opaque) {
				/* Whiteout source */
				flags |= RENAME_WHITEOUT;
			} else {
				/* Switch whiteouts */
				flags |= RENAME_EXCHANGE;
			}
		} else if (is_dir && !new->d_inode && new_opaque) {
			/* Swap with target's whiteout, then drop it */
			flags |= RENAME_EXCHANGE;
			cleanup_whiteout = true;
		}
	}

	old_upperdir = ovl_dentry_upper(old->d_parent);
	new_upperdir = ovl_dentry_upper(new->d_parent);

	trap = lock_rename(new_upperdir, old_upperdir);

	olddentry = ovl_dentry_upper(old);
	newdentry = ovl_dentry_upper(new);
	if (newdentry) {
		if (opaquedir) {
			/* Hand the opaquedir reference over to newdentry */
			newdentry = opaquedir;
			opaquedir = NULL;
		} else {
			dget(newdentry);
		}
	} else {
		new_create = true;
		newdentry = lookup_one_len(new->d_name.name, new_upperdir,
					   new->d_name.len);
		err = PTR_ERR(newdentry);
		if (IS_ERR(newdentry))
			goto out_unlock;
	}

	/* Re-check topology now that the rename lock is held */
	err = -ESTALE;
	if (olddentry->d_parent != old_upperdir)
		goto out_dput;
	if (newdentry->d_parent != new_upperdir)
		goto out_dput;
	if (olddentry == trap)
		goto out_dput;
	if (newdentry == trap)
		goto out_dput;

	if (is_dir && !old_opaque && new_opaque) {
		err = ovl_set_opaque(olddentry);
		if (err)
			goto out_dput;
	}
	if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
		err = ovl_set_opaque(newdentry);
		if (err)
			goto out_dput;
	}

	if (old_opaque || new_opaque) {
		err = ovl_do_rename(old_upperdir->d_inode, olddentry,
				    new_upperdir->d_inode, newdentry,
				    flags);
	} else {
		/* No debug for the plain case */
		BUG_ON(flags & ~RENAME_EXCHANGE);
		err = vfs_rename(old_upperdir->d_inode, olddentry,
				 new_upperdir->d_inode, newdentry,
				 NULL, flags);
	}

	if (err) {
		/* Roll back the opaque xattrs set above */
		if (is_dir && !old_opaque && new_opaque)
			ovl_remove_opaque(olddentry);
		if (!overwrite && new_is_dir && old_opaque && !new_opaque)
			ovl_remove_opaque(newdentry);
		goto out_dput;
	}

	if (is_dir && old_opaque && !new_opaque)
		ovl_remove_opaque(olddentry);
	if (!overwrite && new_is_dir && !old_opaque && new_opaque)
		ovl_remove_opaque(newdentry);

	if (old_opaque != new_opaque) {
		ovl_dentry_set_opaque(old, new_opaque);
		if (!overwrite)
			ovl_dentry_set_opaque(new, old_opaque);
	}

	/* Exchanged whiteout now lives at the old location; remove it */
	if (cleanup_whiteout)
		ovl_cleanup(old_upperdir->d_inode, newdentry);

	ovl_dentry_version_inc(old->d_parent);
	ovl_dentry_version_inc(new->d_parent);

out_dput:
	dput(newdentry);
out_unlock:
	unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
	if (old_opaque || new_opaque) {
		revert_creds(old_cred);
		put_cred(override_cred);
	}
out_drop_write:
	ovl_drop_write(old);
out:
	dput(opaquedir);
	return err;
}
|
|
||||||
|
|
||||||
/* Inode operations for overlay directories. */
const struct inode_operations ovl_dir_inode_operations = {
	.lookup		= ovl_lookup,
	.mkdir		= ovl_mkdir,
	.symlink	= ovl_symlink,
	.unlink		= ovl_unlink,
	.rmdir		= ovl_rmdir,
	.rename2	= ovl_rename2,
	.link		= ovl_link,
	.setattr	= ovl_setattr,
	.create		= ovl_create,
	.mknod		= ovl_mknod,
	.permission	= ovl_permission,
	.getattr	= ovl_dir_getattr,
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
};
|
|
||||||
@@ -1,438 +0,0 @@
|
|||||||
/*
|
|
||||||
*
|
|
||||||
* Copyright (C) 2011 Novell Inc.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 as published by
|
|
||||||
* the Free Software Foundation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <linux/fs.h>
|
|
||||||
#include <linux/slab.h>
|
|
||||||
#include <linux/xattr.h>
|
|
||||||
#include "overlayfs.h"
|
|
||||||
|
|
||||||
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
|
|
||||||
bool no_data)
|
|
||||||
{
|
|
||||||
int err;
|
|
||||||
struct dentry *parent;
|
|
||||||
struct kstat stat;
|
|
||||||
struct path lowerpath;
|
|
||||||
|
|
||||||
parent = dget_parent(dentry);
|
|
||||||
err = ovl_copy_up(parent);
|
|
||||||
if (err)
|
|
||||||
goto out_dput_parent;
|
|
||||||
|
|
||||||
ovl_path_lower(dentry, &lowerpath);
|
|
||||||
err = vfs_getattr(&lowerpath, &stat);
|
|
||||||
if (err)
|
|
||||||
goto out_dput_parent;
|
|
||||||
|
|
||||||
if (no_data)
|
|
||||||
stat.size = 0;
|
|
||||||
|
|
||||||
err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
|
|
||||||
|
|
||||||
out_dput_parent:
|
|
||||||
dput(parent);
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
|
|
||||||
{
|
|
||||||
int err;
|
|
||||||
struct dentry *upperdentry;
|
|
||||||
|
|
||||||
err = ovl_want_write(dentry);
|
|
||||||
if (err)
|
|
||||||
goto out;
|
|
||||||
|
|
||||||
upperdentry = ovl_dentry_upper(dentry);
|
|
||||||
if (upperdentry) {
|
|
||||||
mutex_lock(&upperdentry->d_inode->i_mutex);
|
|
||||||
err = notify_change(upperdentry, attr, NULL);
|
|
||||||
mutex_unlock(&upperdentry->d_inode->i_mutex);
|
|
||||||
} else {
|
|
||||||
err = ovl_copy_up_last(dentry, attr, false);
|
|
||||||
}
|
|
||||||
ovl_drop_write(dentry);
|
|
||||||
out:
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
|
||||||
struct kstat *stat)
|
|
||||||
{
|
|
||||||
struct path realpath;
|
|
||||||
|
|
||||||
ovl_path_real(dentry, &realpath);
|
|
||||||
return vfs_getattr(&realpath, stat);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * ->permission: check permission against the real (layer-backing) inode.
 *
 * For directories the ovl_entry hangs off i_private; for other inodes it
 * must be fetched from a dentry alias, which can block — so RCU walk
 * (MAY_NOT_BLOCK) bails out with -ECHILD for non-directories.
 */
int ovl_permission(struct inode *inode, int mask)
{
	struct ovl_entry *oe;
	struct dentry *alias = NULL;
	struct inode *realinode;
	struct dentry *realdentry;
	bool is_upper;
	int err;

	if (S_ISDIR(inode->i_mode)) {
		oe = inode->i_private;
	} else if (mask & MAY_NOT_BLOCK) {
		return -ECHILD;
	} else {
		/*
		 * For non-directories find an alias and get the info
		 * from there.
		 */
		alias = d_find_any_alias(inode);
		if (WARN_ON(!alias))
			return -ENOENT;

		oe = alias->d_fsdata;
	}

	realdentry = ovl_entry_real(oe, &is_upper);

	/* Careful in RCU walk mode */
	realinode = ACCESS_ONCE(realdentry->d_inode);
	if (!realinode) {
		WARN_ON(!(mask & MAY_NOT_BLOCK));
		err = -ENOENT;
		goto out_dput;
	}

	if (mask & MAY_WRITE) {
		umode_t mode = realinode->i_mode;

		/*
		 * Writes will always be redirected to upper layer, so
		 * ignore lower layer being read-only.
		 *
		 * If the overlay itself is read-only then proceed
		 * with the permission check, don't return EROFS.
		 * This will only happen if this is the lower layer of
		 * another overlayfs.
		 *
		 * If upper fs becomes read-only after the overlay was
		 * constructed return EROFS to prevent modification of
		 * upper layer.
		 */
		err = -EROFS;
		if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
			goto out_dput;
	}

	err = __inode_permission(realinode, mask);
out_dput:
	/* alias is NULL for the directory path; dput(NULL) is a no-op */
	dput(alias);
	return err;
}
|
|
||||||
|
|
||||||
|
|
||||||
/* Cookie passed from ovl_follow_link() to ovl_put_link(). */
struct ovl_link_data {
	struct dentry *realdentry;	/* underlying dentry the link was followed on */
	void *cookie;			/* cookie returned by the real ->follow_link() */
};
|
|
||||||
|
|
||||||
/*
 * ->follow_link: delegate to the real inode's follow_link.
 *
 * If the real filesystem needs a put_link callback, wrap its cookie in
 * an ovl_link_data so ovl_put_link() can forward it; otherwise return
 * NULL (nothing to release).
 */
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	void *ret;
	struct dentry *realdentry;
	struct inode *realinode;

	realdentry = ovl_dentry_real(dentry);
	realinode = realdentry->d_inode;

	if (WARN_ON(!realinode->i_op->follow_link))
		return ERR_PTR(-EPERM);

	ret = realinode->i_op->follow_link(realdentry, nd);
	if (IS_ERR(ret))
		return ret;

	if (realinode->i_op->put_link) {
		struct ovl_link_data *data;

		data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
		if (!data) {
			/* Undo the follow before failing */
			realinode->i_op->put_link(realdentry, nd, ret);
			return ERR_PTR(-ENOMEM);
		}
		data->realdentry = realdentry;
		data->cookie = ret;

		return data;
	} else {
		return NULL;
	}
}
|
|
||||||
|
|
||||||
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
|
|
||||||
{
|
|
||||||
struct inode *realinode;
|
|
||||||
struct ovl_link_data *data = c;
|
|
||||||
|
|
||||||
if (!data)
|
|
||||||
return;
|
|
||||||
|
|
||||||
realinode = data->realdentry->d_inode;
|
|
||||||
realinode->i_op->put_link(data->realdentry, nd, data->cookie);
|
|
||||||
kfree(data);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ->readlink: read the link target from the real backing inode. */
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
	struct path realpath;
	const struct inode_operations *ops;

	ovl_path_real(dentry, &realpath);
	ops = realpath.dentry->d_inode->i_op;

	if (!ops->readlink)
		return -EINVAL;

	touch_atime(&realpath);

	return ops->readlink(realpath.dentry, buf, bufsiz);
}
|
|
||||||
|
|
||||||
|
|
||||||
static bool ovl_is_private_xattr(const char *name)
|
|
||||||
{
|
|
||||||
return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * ->setxattr: refuse the overlay's private namespace, then copy the
 * file up (writes always go to the upper layer) and set the xattr on
 * the upper dentry.
 */
int ovl_setxattr(struct dentry *dentry, const char *name,
		 const void *value, size_t size, int flags)
{
	int err = ovl_want_write(dentry);

	if (err)
		return err;

	if (ovl_is_private_xattr(name)) {
		err = -EPERM;
	} else {
		err = ovl_copy_up(dentry);
		if (!err) {
			struct dentry *upperdentry = ovl_dentry_upper(dentry);

			err = vfs_setxattr(upperdentry, name, value, size,
					   flags);
		}
	}

	ovl_drop_write(dentry);
	return err;
}
|
|
||||||
|
|
||||||
static bool ovl_need_xattr_filter(struct dentry *dentry,
|
|
||||||
enum ovl_path_type type)
|
|
||||||
{
|
|
||||||
if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
|
|
||||||
return S_ISDIR(dentry->d_inode->i_mode);
|
|
||||||
else
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * ->getxattr: read from the real backing dentry, hiding the overlay's
 * private xattrs where filtering applies.
 */
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
		     void *value, size_t size)
{
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
	bool hidden = ovl_need_xattr_filter(dentry, type) &&
		      ovl_is_private_xattr(name);

	if (hidden)
		return -ENODATA;

	return vfs_getxattr(realpath.dentry, name, value, size);
}
|
|
||||||
|
|
||||||
/*
 * ->listxattr: list xattrs of the real backing dentry, stripping the
 * overlay's private names in-place where filtering applies.
 */
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
	ssize_t res;
	int off;

	res = vfs_listxattr(realpath.dentry, list, size);
	/* Error, empty list, or probe call (size == 0): nothing to filter */
	if (res <= 0 || size == 0)
		return res;

	if (!ovl_need_xattr_filter(dentry, type))
		return res;

	/* filter out private xattrs */
	for (off = 0; off < res;) {
		char *s = list + off;
		size_t slen = strlen(s) + 1;	/* include the NUL separator */

		BUG_ON(off + slen > res);

		if (ovl_is_private_xattr(s)) {
			/* Close the gap; off now points at the moved name */
			res -= slen;
			memmove(s, s + slen, res - off);
		} else {
			off += slen;
		}
	}

	return res;
}
|
|
||||||
|
|
||||||
/*
 * Remove an extended attribute from an overlay dentry.
 *
 * Private overlay attributes are treated as absent (-ENODATA).  For a
 * lower-only file the attribute's existence is probed first so that a
 * failed removal does not needlessly trigger a copy-up.
 */
int ovl_removexattr(struct dentry *dentry, const char *name)
{
	int err;
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	/* Pretend private overlay attributes do not exist. */
	err = -ENODATA;
	if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
		goto out_drop_write;

	if (!OVL_TYPE_UPPER(type)) {
		/* Probe on the lower dentry before committing to copy-up. */
		err = vfs_getxattr(realpath.dentry, name, NULL, 0);
		if (err < 0)
			goto out_drop_write;

		err = ovl_copy_up(dentry);
		if (err)
			goto out_drop_write;

		/* Removal must happen on the freshly copied-up upper. */
		ovl_path_upper(dentry, &realpath);
	}

	err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
	ovl_drop_write(dentry);
out:
	return err;
}
|
|
||||||
|
|
||||||
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
|
|
||||||
struct dentry *realdentry)
|
|
||||||
{
|
|
||||||
if (OVL_TYPE_UPPER(type))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
if (special_file(realdentry->d_inode->i_mode))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Open an overlay file, copying it up first if the open is for write.
 *
 * When the dentry is flagged "nocopyupw" the copy-up is skipped entirely
 * and the lower file is opened directly.
 */
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
			   const struct cred *cred)
{
	int err;
	struct path realpath;
	enum ovl_path_type type;
	/* Tracks whether ovl_want_write() was taken and must be dropped. */
	bool want_write = false;

	type = ovl_path_real(dentry, &realpath);
	if (!ovl_is_nocopyupw(dentry)) {
		if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
			want_write = true;
			err = ovl_want_write(dentry);
			if (err)
				goto out;

			/*
			 * O_TRUNC discards the contents anyway, so take the
			 * copy-up path variant instead of a full copy.
			 */
			if (file->f_flags & O_TRUNC)
				err = ovl_copy_up_last(dentry, NULL, true);
			else
				err = ovl_copy_up(dentry);
			if (err)
				goto out_drop_write;

			/* Re-resolve: the file now lives on the upper layer. */
			ovl_path_upper(dentry, &realpath);
		}
	}

	err = vfs_open(&realpath, file, cred);
out_drop_write:
	if (want_write)
		ovl_drop_write(dentry);
out:
	return err;
}
|
|
||||||
|
|
||||||
/* Inode operations for regular, special and FIFO/socket overlay inodes. */
static const struct inode_operations ovl_file_inode_operations = {
	.setattr	= ovl_setattr,
	.permission	= ovl_permission,
	.getattr	= ovl_getattr,
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
	.dentry_open	= ovl_dentry_open,
};
|
|
||||||
|
|
||||||
/* Inode operations for overlay symlinks (no dentry_open: not openable). */
static const struct inode_operations ovl_symlink_inode_operations = {
	.setattr	= ovl_setattr,
	.follow_link	= ovl_follow_link,
	.put_link	= ovl_put_link,
	.readlink	= ovl_readlink,
	.getattr	= ovl_getattr,
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
};
|
|
||||||
|
|
||||||
/*
 * Allocate and initialise an overlay inode for the given file mode.
 * Returns NULL on allocation failure or an unsupported file type.
 */
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
			    struct ovl_entry *oe)
{
	struct inode *inode;

	inode = new_inode(sb);
	if (!inode)
		return NULL;

	/* Only the file-type bits are used below. */
	mode &= S_IFMT;

	inode->i_ino = get_next_ino();
	inode->i_mode = mode;
	inode->i_flags |= S_NOATIME | S_NOCMTIME;

	switch (mode) {
	case S_IFDIR:
		/* Stash the overlay entry for directory operations. */
		inode->i_private = oe;
		inode->i_op = &ovl_dir_inode_operations;
		inode->i_fop = &ovl_dir_operations;
		break;

	case S_IFLNK:
		inode->i_op = &ovl_symlink_inode_operations;
		break;

	case S_IFREG:
	case S_IFSOCK:
	case S_IFBLK:
	case S_IFCHR:
	case S_IFIFO:
		inode->i_op = &ovl_file_inode_operations;
		break;

	default:
		WARN(1, "illegal file type: %i\n", mode);
		iput(inode);
		inode = NULL;
	}

	return inode;
}
|
|
||||||
@@ -1,200 +0,0 @@
|
|||||||
/*
|
|
||||||
*
|
|
||||||
* Copyright (C) 2011 Novell Inc.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 as published by
|
|
||||||
* the Free Software Foundation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <linux/kernel.h>
|
|
||||||
|
|
||||||
struct ovl_entry;

/*
 * Bit flags describing where an overlay object currently lives:
 * UPPER means an upper-layer copy exists, MERGE means lower layers
 * contribute too, PURE means nothing below the upper copy.
 */
enum ovl_path_type {
	__OVL_PATH_PURE		= (1 << 0),
	__OVL_PATH_UPPER	= (1 << 1),
	__OVL_PATH_MERGE	= (1 << 2),
};
|
|
||||||
|
|
||||||
/* Convenience accessors for the ovl_path_type bit flags. */
#define OVL_TYPE_UPPER(type)	((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type)	((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
#define OVL_TYPE_MERGE_OR_LOWER(type) \
	(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))

/* Namespace prefix for overlayfs' private extended attributes. */
#define OVL_XATTR_PRE_NAME "trusted.overlay."
/*
 * Derive the prefix length from the string itself (sizeof includes the
 * terminating NUL) so the two definitions can never drift apart.
 */
#define OVL_XATTR_PRE_LEN  (sizeof(OVL_XATTR_PRE_NAME) - 1)
#define OVL_XATTR_OPAQUE   OVL_XATTR_PRE_NAME"opaque"
|
|
||||||
|
|
||||||
/* vfs_rmdir() wrapper that traces the call and its result. */
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
	int err = vfs_rmdir(dir, dentry);
	pr_debug("rmdir(%pd2) = %i\n", dentry, err);
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_unlink() wrapper that traces the call and its result. */
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
	int err = vfs_unlink(dir, dentry, NULL);
	pr_debug("unlink(%pd2) = %i\n", dentry, err);
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_link() wrapper; tracing is optional since some callers are noisy. */
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
			      struct dentry *new_dentry, bool debug)
{
	int err = vfs_link(old_dentry, dir, new_dentry, NULL);
	if (debug) {
		pr_debug("link(%pd2, %pd2) = %i\n",
			 old_dentry, new_dentry, err);
	}
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_create() wrapper with optional tracing. */
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
				umode_t mode, bool debug)
{
	int err = vfs_create(dir, dentry, mode, true);
	if (debug)
		pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_mkdir() wrapper with optional tracing. */
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
			       umode_t mode, bool debug)
{
	int err = vfs_mkdir(dir, dentry, mode);
	if (debug)
		pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_mknod() wrapper with optional tracing. */
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
			       umode_t mode, dev_t dev, bool debug)
{
	int err = vfs_mknod(dir, dentry, mode, dev);
	if (debug) {
		pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
			 dentry, mode, dev, err);
	}
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_symlink() wrapper with optional tracing. */
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
				 const char *oldname, bool debug)
{
	int err = vfs_symlink(dir, dentry, oldname);
	if (debug)
		pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
	return err;
}
|
|
||||||
|
|
||||||
/*
 * vfs_setxattr() wrapper that traces the call and its result.
 *
 * The value is printed with "%.*s" (precision), not "%*s" (field width):
 * an xattr value is arbitrary bytes and need not be NUL-terminated, so a
 * width specifier would let printf read past the end of the buffer.
 */
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
				  const void *value, size_t size, int flags)
{
	int err = vfs_setxattr(dentry, name, value, size, flags);
	pr_debug("setxattr(%pd2, \"%s\", \"%.*s\", 0x%x) = %i\n",
		 dentry, name, (int) size, (char *) value, flags, err);
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_removexattr() wrapper that traces the call and its result. */
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
	int err = vfs_removexattr(dentry, name);
	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
	return err;
}
|
|
||||||
|
|
||||||
/*
 * vfs_rename() wrapper; traces the attempt up front and logs again on
 * failure so both sides of a failed rename are visible in the trace.
 */
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
				struct inode *newdir, struct dentry *newdentry,
				unsigned int flags)
{
	int err;

	pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
		 olddentry, newdentry, flags);

	err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);

	if (err) {
		pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
			 olddentry, newdentry, err);
	}
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_whiteout() wrapper that traces the call and its result. */
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
	int err = vfs_whiteout(dir, dentry);
	pr_debug("whiteout(%pd2) = %i\n", dentry, err);
	return err;
}
|
|
||||||
|
|
||||||
/* Shared overlay helpers implemented elsewhere in the filesystem. */
bool ovl_is_nocopyupw(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
			  unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);

struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
				struct kstat *stat, const char *link);
|
|
||||||
|
|
||||||
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);

/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
		 const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
		     void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);

struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
			    struct ovl_entry *oe);
|
|
||||||
/* Propagate ownership from the real inode to the overlay inode. */
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
	to->i_uid = from->i_uid;
	to->i_gid = from->i_gid;
}
|
|
||||||
|
|
||||||
/* dir.c */
extern const struct inode_operations ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
		    struct kstat *stat, const char *link,
		    struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);

/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
		    struct path *lowerpath, struct kstat *stat,
		    struct iattr *attr);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
|
|
||||||
@@ -1,557 +0,0 @@
|
|||||||
/*
|
|
||||||
*
|
|
||||||
* Copyright (C) 2011 Novell Inc.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 as published by
|
|
||||||
* the Free Software Foundation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <linux/fs.h>
|
|
||||||
#include <linux/slab.h>
|
|
||||||
#include <linux/namei.h>
|
|
||||||
#include <linux/file.h>
|
|
||||||
#include <linux/xattr.h>
|
|
||||||
#include <linux/rbtree.h>
|
|
||||||
#include <linux/security.h>
|
|
||||||
#include <linux/cred.h>
|
|
||||||
#include "overlayfs.h"
|
|
||||||
|
|
||||||
/* One merged directory entry, linked on both a list and an rbtree. */
struct ovl_cache_entry {
	unsigned int len;	/* name length, excluding the NUL */
	unsigned int type;	/* DT_* type from the underlying fs */
	u64 ino;
	struct list_head l_node;	/* position in iteration order */
	struct rb_node node;		/* lookup by name */
	bool is_whiteout;	/* entry is a whiteout: hide from readdir */
	char name[];		/* flexible array, NUL-terminated */
};
|
|
||||||
|
|
||||||
/* Shared, refcounted cache of a merged directory's entries. */
struct ovl_dir_cache {
	long refcount;
	u64 version;	/* dentry version at fill time; stale if it moves on */
	struct list_head entries;
};
|
|
||||||
|
|
||||||
/* Per-readdir state threaded through the dir_context actor callbacks. */
struct ovl_readdir_data {
	struct dir_context ctx;
	bool is_merge;		/* filling a lower layer into existing entries */
	struct rb_root root;	/* name-keyed index of upper entries */
	struct list_head *list;	/* destination entry list */
	struct list_head middle;	/* staging list for lower-layer entries */
	struct dentry *dir;	/* directory being read (for lookups) */
	int count;		/* entries seen in the current iterate pass */
	int err;		/* first error hit by the actor */
};
|
|
||||||
|
|
||||||
/* Per-open-file state for an overlay directory. */
struct ovl_dir_file {
	bool is_real;	/* single layer: pass reads straight through */
	bool is_upper;	/* realfile was opened on the upper layer */
	struct ovl_dir_cache *cache;
	struct list_head *cursor;	/* current position in cache->entries */
	struct file *realfile;
	struct file *upperfile;	/* lazily opened for fsync after copy-up */
};
|
|
||||||
|
|
||||||
/* Map an rbtree node back to its containing cache entry. */
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
	return container_of(n, struct ovl_cache_entry, node);
}
|
|
||||||
|
|
||||||
/*
 * Look up a cache entry by name in the rbtree, or NULL if absent.
 *
 * Names compare by the first @len bytes, then by length, so a prefix
 * sorts before a longer name with the same prefix.
 */
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
						    const char *name, int len)
{
	struct rb_node *node = root->rb_node;
	int cmp;

	while (node) {
		struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);

		cmp = strncmp(name, p->name, len);
		if (cmp > 0)
			node = p->node.rb_right;
		else if (cmp < 0 || len < p->len)
			node = p->node.rb_left;
		else
			return p;
	}

	return NULL;
}
|
|
||||||
|
|
||||||
/*
 * Allocate and fill a cache entry for one directory entry.
 *
 * DT_CHR entries may be whiteout devices, so those are looked up (with
 * temporarily raised CAP_DAC_OVERRIDE) to set is_whiteout.  Returns NULL
 * on allocation failure.
 */
static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir,
						   const char *name, int len,
						   u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *p;
	/* name[] is a flexible array member; +1 for the NUL terminator. */
	size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);

	p = kmalloc(size, GFP_KERNEL);
	if (!p)
		return NULL;

	memcpy(p->name, name, len);
	p->name[len] = '\0';
	p->len = len;
	p->type = d_type;
	p->ino = ino;
	p->is_whiteout = false;

	if (d_type == DT_CHR) {
		struct dentry *dentry;
		const struct cred *old_cred;
		struct cred *override_cred;

		override_cred = prepare_creds();
		if (!override_cred) {
			kfree(p);
			return NULL;
		}

		/*
		 * CAP_DAC_OVERRIDE for lookup
		 */
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		old_cred = override_creds(override_cred);

		dentry = lookup_one_len(name, dir, len);
		if (!IS_ERR(dentry)) {
			p->is_whiteout = ovl_is_whiteout(dentry);
			dput(dentry);
		}
		/* Lookup failure is non-fatal; entry stays non-whiteout. */
		revert_creds(old_cred);
		put_cred(override_cred);
	}
	return p;
}
|
|
||||||
|
|
||||||
/*
 * Add a directory entry to both the rbtree and the list, unless a
 * same-named entry already exists (upper layer wins; duplicates from
 * lower layers are silently dropped).
 */
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
				  const char *name, int len, u64 ino,
				  unsigned int d_type)
{
	struct rb_node **newp = &rdd->root.rb_node;
	struct rb_node *parent = NULL;
	struct ovl_cache_entry *p;

	while (*newp) {
		int cmp;
		struct ovl_cache_entry *tmp;

		parent = *newp;
		tmp = ovl_cache_entry_from_node(*newp);
		/* Same ordering as ovl_cache_entry_find(). */
		cmp = strncmp(name, tmp->name, len);
		if (cmp > 0)
			newp = &tmp->node.rb_right;
		else if (cmp < 0 || len < tmp->len)
			newp = &tmp->node.rb_left;
		else
			return 0;
	}

	p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type);
	if (p == NULL)
		return -ENOMEM;

	list_add_tail(&p->l_node, rdd->list);
	rb_link_node(&p->node, parent, newp);
	rb_insert_color(&p->node, &rdd->root);

	return 0;
}
|
|
||||||
|
|
||||||
/*
 * Merge pass: move an entry also present in the upper layer to the
 * "middle" staging list, or create a new lower-only entry there.
 */
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
			  const char *name, int namelen,
			  loff_t offset, u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *p;

	p = ovl_cache_entry_find(&rdd->root, name, namelen);
	if (p) {
		list_move_tail(&p->l_node, &rdd->middle);
	} else {
		p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type);
		if (p == NULL)
			rdd->err = -ENOMEM;
		else
			list_add_tail(&p->l_node, &rdd->middle);
	}

	return rdd->err;
}
|
|
||||||
|
|
||||||
/* Free every cache entry on @list and reset it to an empty list. */
void ovl_cache_free(struct list_head *list)
{
	struct ovl_cache_entry *p;
	struct ovl_cache_entry *n;

	list_for_each_entry_safe(p, n, list, l_node)
		kfree(p);

	INIT_LIST_HEAD(list);
}
|
|
||||||
|
|
||||||
/*
 * Drop one reference on the open file's directory cache; on the last
 * reference, detach it from the dentry and free it.
 */
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
	struct ovl_dir_cache *cache = od->cache;

	WARN_ON(cache->refcount <= 0);
	cache->refcount--;
	if (!cache->refcount) {
		/* The dentry may already point at a newer cache. */
		if (ovl_dir_cache(dentry) == cache)
			ovl_set_dir_cache(dentry, NULL);

		ovl_cache_free(&cache->entries);
		kfree(cache);
	}
}
|
|
||||||
|
|
||||||
/*
 * dir_context actor shared by both readdir passes: the first pass builds
 * the rbtree of upper entries, the merge pass folds lower layers in.
 */
static int ovl_fill_merge(struct dir_context *ctx, const char *name,
			  int namelen, loff_t offset, u64 ino,
			  unsigned int d_type)
{
	struct ovl_readdir_data *rdd =
		container_of(ctx, struct ovl_readdir_data, ctx);

	rdd->count++;
	if (rdd->is_merge)
		return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);

	return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
}
|
|
||||||
|
|
||||||
/*
 * Open one real directory layer and feed all its entries through the
 * rdd actor, iterating until the directory is exhausted or an error.
 */
static inline int ovl_dir_read(struct path *realpath,
			       struct ovl_readdir_data *rdd)
{
	struct file *realfile;
	int err;

	realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
	if (IS_ERR(realfile))
		return PTR_ERR(realfile);

	rdd->dir = realpath->dentry;
	rdd->ctx.pos = 0;
	do {
		rdd->count = 0;
		rdd->err = 0;
		err = iterate_dir(realfile, &rdd->ctx);
		if (err >= 0)
			err = rdd->err;
	/* count == 0 means iterate_dir() emitted nothing: we're done. */
	} while (!err && rdd->count);
	fput(realfile);

	return err;
}
|
|
||||||
|
|
||||||
/*
 * Called on rewind (pos 0): drop a stale cache and, if the directory
 * became merged since open (e.g. after copy-up), stop passing reads
 * straight through to the real file.
 */
static void ovl_dir_reset(struct file *file)
{
	struct ovl_dir_file *od = file->private_data;
	struct ovl_dir_cache *cache = od->cache;
	struct dentry *dentry = file->f_path.dentry;
	enum ovl_path_type type = ovl_path_type(dentry);

	if (cache && ovl_dentry_version_get(dentry) != cache->version) {
		ovl_cache_put(od, dentry);
		od->cache = NULL;
		od->cursor = NULL;
	}
	WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
	if (od->is_real && OVL_TYPE_MERGE(type))
		od->is_real = false;
}
|
|
||||||
|
|
||||||
/*
 * Read all layers of a directory into @list, merging duplicates.
 * The last (lowest) layer goes through the merge pass and is inserted
 * before earlier entries to keep offsets reasonably stable.
 */
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
	int err;
	struct path realpath;
	struct ovl_readdir_data rdd = {
		.ctx.actor = ovl_fill_merge,
		.list = list,
		.root = RB_ROOT,
		.is_merge = false,
	};
	int idx, next;

	for (idx = 0; idx != -1; idx = next) {
		next = ovl_path_next(idx, dentry, &realpath);

		if (next != -1) {
			/* Not the last layer: plain rbtree-building pass. */
			err = ovl_dir_read(&realpath, &rdd);
			if (err)
				break;
		} else {
			/*
			 * Insert lowest layer entries before upper ones, this
			 * allows offsets to be reasonably constant
			 */
			list_add(&rdd.middle, rdd.list);
			rdd.is_merge = true;
			err = ovl_dir_read(&realpath, &rdd);
			list_del(&rdd.middle);
		}
	}
	return err;
}
|
|
||||||
|
|
||||||
/* Position the list cursor at entry number @pos (or the list head at end). */
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
	struct list_head *p;
	loff_t off = 0;

	list_for_each(p, &od->cache->entries) {
		if (off >= pos)
			break;
		off++;
	}
	/* Cursor is safe since the cache is stable */
	od->cursor = p;
}
|
|
||||||
|
|
||||||
/*
 * Get the merged entry cache for @dentry: reuse the cached one if still
 * current, otherwise build a fresh one.  Returns ERR_PTR on failure.
 */
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
	int res;
	struct ovl_dir_cache *cache;

	cache = ovl_dir_cache(dentry);
	if (cache && ovl_dentry_version_get(dentry) == cache->version) {
		cache->refcount++;
		return cache;
	}
	/* Stale or absent: detach and rebuild. */
	ovl_set_dir_cache(dentry, NULL);

	cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
	if (!cache)
		return ERR_PTR(-ENOMEM);

	cache->refcount = 1;
	INIT_LIST_HEAD(&cache->entries);

	res = ovl_dir_read_merged(dentry, &cache->entries);
	if (res) {
		ovl_cache_free(&cache->entries);
		kfree(cache);
		return ERR_PTR(res);
	}

	/* Record the version the cache corresponds to. */
	cache->version = ovl_dentry_version_get(dentry);
	ovl_set_dir_cache(dentry, cache);

	return cache;
}
|
|
||||||
|
|
||||||
/*
 * ->iterate for overlay directories.  Single-layer dirs pass straight
 * through; merged dirs walk the cached entry list, skipping whiteouts.
 */
static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct ovl_cache_entry *p;

	if (!ctx->pos)
		ovl_dir_reset(file);

	if (od->is_real)
		return iterate_dir(od->realfile, ctx);

	if (!od->cache) {
		struct ovl_dir_cache *cache;

		cache = ovl_cache_get(dentry);
		if (IS_ERR(cache))
			return PTR_ERR(cache);

		od->cache = cache;
		/* Resume from wherever the caller last seeked to. */
		ovl_seek_cursor(od, ctx->pos);
	}

	while (od->cursor != &od->cache->entries) {
		p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
		if (!p->is_whiteout)
			if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
				break;
		od->cursor = p->l_node.next;
		ctx->pos++;
	}
	return 0;
}
|
|
||||||
|
|
||||||
/*
 * ->llseek for overlay directories.  Real (single-layer) dirs delegate
 * to the underlying file; merged dirs seek within the cached entry list
 * and only support SEEK_SET/SEEK_CUR.
 */
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
	loff_t res;
	struct ovl_dir_file *od = file->private_data;

	mutex_lock(&file_inode(file)->i_mutex);
	if (!file->f_pos)
		ovl_dir_reset(file);

	if (od->is_real) {
		res = vfs_llseek(od->realfile, offset, origin);
		/* Keep our position mirrored with the real file's. */
		file->f_pos = od->realfile->f_pos;
	} else {
		res = -EINVAL;

		switch (origin) {
		case SEEK_CUR:
			offset += file->f_pos;
			break;
		case SEEK_SET:
			break;
		default:
			goto out_unlock;
		}
		if (offset < 0)
			goto out_unlock;

		if (offset != file->f_pos) {
			file->f_pos = offset;
			if (od->cache)
				ovl_seek_cursor(od, offset);
		}
		res = offset;
	}
out_unlock:
	mutex_unlock(&file_inode(file)->i_mutex);

	return res;
}
|
|
||||||
|
|
||||||
/*
 * ->fsync for overlay directories.
 *
 * If the directory was lower at open time but has since been copied up,
 * sync the upper copy instead.  The upper file is opened lazily with a
 * double-checked pattern: a lockless read first, then re-check under
 * i_mutex before publishing od->upperfile.
 */
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
			 int datasync)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct file *realfile = od->realfile;

	/*
	 * Need to check if we started out being a lower dir, but got copied up
	 */
	if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
		struct inode *inode = file_inode(file);

		realfile = lockless_dereference(od->upperfile);
		if (!realfile) {
			struct path upperpath;

			ovl_path_upper(dentry, &upperpath);
			realfile = ovl_path_open(&upperpath, O_RDONLY);
			/* Pairs with the lockless read of od->upperfile. */
			smp_mb__before_spinlock();
			mutex_lock(&inode->i_mutex);
			if (!od->upperfile) {
				if (IS_ERR(realfile)) {
					mutex_unlock(&inode->i_mutex);
					return PTR_ERR(realfile);
				}
				od->upperfile = realfile;
			} else {
				/* somebody has beaten us to it */
				if (!IS_ERR(realfile))
					fput(realfile);
				realfile = od->upperfile;
			}
			mutex_unlock(&inode->i_mutex);
		}
	}

	return vfs_fsync_range(realfile, start, end, datasync);
}
|
|
||||||
|
|
||||||
/* ->release for overlay directories: drop cache ref and close real files. */
static int ovl_dir_release(struct inode *inode, struct file *file)
{
	struct ovl_dir_file *od = file->private_data;

	if (od->cache) {
		/* Cache teardown is serialised on the directory's i_mutex. */
		mutex_lock(&inode->i_mutex);
		ovl_cache_put(od, file->f_path.dentry);
		mutex_unlock(&inode->i_mutex);
	}
	fput(od->realfile);
	if (od->upperfile)
		fput(od->upperfile);
	kfree(od);

	return 0;
}
|
|
||||||
|
|
||||||
/*
 * ->open for overlay directories: open the real directory and record
 * whether reads can pass through (non-merged) and which layer it's on.
 */
static int ovl_dir_open(struct inode *inode, struct file *file)
{
	struct path realpath;
	struct file *realfile;
	struct ovl_dir_file *od;
	enum ovl_path_type type;

	od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
	if (!od)
		return -ENOMEM;

	type = ovl_path_real(file->f_path.dentry, &realpath);
	realfile = ovl_path_open(&realpath, file->f_flags);
	if (IS_ERR(realfile)) {
		kfree(od);
		return PTR_ERR(realfile);
	}
	od->realfile = realfile;
	od->is_real = !OVL_TYPE_MERGE(type);
	od->is_upper = OVL_TYPE_UPPER(type);
	file->private_data = od;

	return 0;
}
|
|
||||||
|
|
||||||
/* File operations for overlay directories. */
const struct file_operations ovl_dir_operations = {
	.read		= generic_read_dir,
	.open		= ovl_dir_open,
	.iterate	= ovl_iterate,
	.llseek		= ovl_dir_llseek,
	.fsync		= ovl_dir_fsync,
	.release	= ovl_dir_release,
};
|
|
||||||
|
|
||||||
/*
 * Check that a merged directory is empty (ignoring whiteouts, "." and
 * "..").  On success @list holds the merged entries for the caller to
 * free (or reuse for whiteout cleanup).  Returns -ENOTEMPTY otherwise.
 */
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
	int err;
	struct ovl_cache_entry *p;

	err = ovl_dir_read_merged(dentry, list);
	if (err)
		return err;

	err = 0;

	list_for_each_entry(p, list, l_node) {
		if (p->is_whiteout)
			continue;

		/* Skip "." and ".." — they don't make a dir non-empty. */
		if (p->name[0] == '.') {
			if (p->len == 1)
				continue;
			if (p->len == 2 && p->name[1] == '.')
				continue;
		}
		err = -ENOTEMPTY;
		break;
	}

	return err;
}
|
|
||||||
|
|
||||||
/*
 * Remove every whiteout named in @list from the @upper directory.
 * Lookup failures are logged and skipped; this is best-effort cleanup.
 */
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
	struct ovl_cache_entry *p;

	mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
	list_for_each_entry(p, list, l_node) {
		struct dentry *dentry;

		if (!p->is_whiteout)
			continue;

		dentry = lookup_one_len(p->name, upper, p->len);
		if (IS_ERR(dentry)) {
			pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
			       upper->d_name.name, p->len, p->name,
			       (int) PTR_ERR(dentry));
			continue;
		}
		ovl_cleanup(upper->d_inode, dentry);
		dput(dentry);
	}
	mutex_unlock(&upper->d_inode->i_mutex);
}
|
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -1,7 +0,0 @@
|
|||||||
kmod(mcoverlay
|
|
||||||
SOURCES
|
|
||||||
copy_up.c dir.c inode.c readdir.c super.c namei.c util.c export.c
|
|
||||||
INSTALL_DEST
|
|
||||||
${KMODDIR}
|
|
||||||
)
|
|
||||||
|
|
||||||
@@ -1,804 +0,0 @@
|
|||||||
/*
|
|
||||||
*
|
|
||||||
* Copyright (C) 2011 Novell Inc.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 as published by
|
|
||||||
* the Free Software Foundation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <linux/module.h>
|
|
||||||
#include <linux/fs.h>
|
|
||||||
#include <linux/slab.h>
|
|
||||||
#include <linux/file.h>
|
|
||||||
#include <linux/splice.h>
|
|
||||||
#include <linux/xattr.h>
|
|
||||||
#include <linux/security.h>
|
|
||||||
#include <linux/uaccess.h>
|
|
||||||
#include <linux/sched/signal.h>
|
|
||||||
#include <linux/cred.h>
|
|
||||||
#include <linux/namei.h>
|
|
||||||
#include <linux/fdtable.h>
|
|
||||||
#include <linux/ratelimit.h>
|
|
||||||
#include <linux/exportfs.h>
|
|
||||||
#include "overlayfs.h"
|
|
||||||
|
|
||||||
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
|
|
||||||
|
|
||||||
static bool __read_mostly ovl_check_copy_up;
|
|
||||||
module_param_named(check_copy_up, ovl_check_copy_up, bool,
|
|
||||||
S_IWUSR | S_IRUGO);
|
|
||||||
MODULE_PARM_DESC(ovl_check_copy_up,
|
|
||||||
"Warn on copy-up when causing process also has a R/O fd open");
|
|
||||||
|
|
||||||
static int ovl_check_fd(const void *data, struct file *f, unsigned int fd)
|
|
||||||
{
|
|
||||||
const struct dentry *dentry = data;
|
|
||||||
|
|
||||||
if (file_inode(f) == d_inode(dentry))
|
|
||||||
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
|
|
||||||
f, fd, current->pid, current->comm);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Check the fds open by this process and warn if something like the following
|
|
||||||
* scenario is about to occur:
|
|
||||||
*
|
|
||||||
* fd1 = open("foo", O_RDONLY);
|
|
||||||
* fd2 = open("foo", O_RDWR);
|
|
||||||
*/
|
|
||||||
static void ovl_do_check_copy_up(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
if (ovl_check_copy_up)
|
|
||||||
iterate_fd(current->files, 0, ovl_check_fd, dentry);
|
|
||||||
}
|
|
||||||
|
|
||||||
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
|
|
||||||
{
|
|
||||||
ssize_t list_size, size, value_size = 0;
|
|
||||||
char *buf, *name, *value = NULL;
|
|
||||||
int uninitialized_var(error);
|
|
||||||
size_t slen;
|
|
||||||
|
|
||||||
if (!(old->d_inode->i_opflags & IOP_XATTR) ||
|
|
||||||
!(new->d_inode->i_opflags & IOP_XATTR))
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
list_size = vfs_listxattr(old, NULL, 0);
|
|
||||||
if (list_size <= 0) {
|
|
||||||
if (list_size == -EOPNOTSUPP)
|
|
||||||
return 0;
|
|
||||||
return list_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
buf = kzalloc(list_size, GFP_KERNEL);
|
|
||||||
if (!buf)
|
|
||||||
return -ENOMEM;
|
|
||||||
|
|
||||||
list_size = vfs_listxattr(old, buf, list_size);
|
|
||||||
if (list_size <= 0) {
|
|
||||||
error = list_size;
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (name = buf; list_size; name += slen) {
|
|
||||||
slen = strnlen(name, list_size) + 1;
|
|
||||||
|
|
||||||
/* underlying fs providing us with an broken xattr list? */
|
|
||||||
if (WARN_ON(slen > list_size)) {
|
|
||||||
error = -EIO;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
list_size -= slen;
|
|
||||||
|
|
||||||
if (ovl_is_private_xattr(name))
|
|
||||||
continue;
|
|
||||||
retry:
|
|
||||||
size = vfs_getxattr(old, name, value, value_size);
|
|
||||||
if (size == -ERANGE)
|
|
||||||
size = vfs_getxattr(old, name, NULL, 0);
|
|
||||||
|
|
||||||
if (size < 0) {
|
|
||||||
/* NOFSCHECK */
|
|
||||||
continue;
|
|
||||||
|
|
||||||
error = size;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (size > value_size) {
|
|
||||||
void *new;
|
|
||||||
|
|
||||||
new = krealloc(value, size, GFP_KERNEL);
|
|
||||||
if (!new) {
|
|
||||||
error = -ENOMEM;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
value = new;
|
|
||||||
value_size = size;
|
|
||||||
goto retry;
|
|
||||||
}
|
|
||||||
|
|
||||||
error = security_inode_copy_up_xattr(name);
|
|
||||||
if (error < 0 && error != -EOPNOTSUPP)
|
|
||||||
break;
|
|
||||||
if (error == 1) {
|
|
||||||
error = 0;
|
|
||||||
continue; /* Discard */
|
|
||||||
}
|
|
||||||
error = vfs_setxattr(new, name, value, size, 0);
|
|
||||||
if (error)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
kfree(value);
|
|
||||||
out:
|
|
||||||
kfree(buf);
|
|
||||||
return error;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Copy @len bytes of file data from lower path @old to upper path @new.
 *
 * First attempts an in-kernel reflink (vfs_clone_file_range) which is
 * effectively free when both paths live on the same clone-capable fs;
 * otherwise falls back to chunked do_splice_direct() copies of up to
 * OVL_COPY_UP_CHUNK_SIZE each, checking for fatal signals between chunks.
 * On success the new file is fsync'ed. Returns 0 or negative errno.
 */
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
	struct file *old_file;
	struct file *new_file;
	loff_t old_pos = 0;
	loff_t new_pos = 0;
	int error = 0;

	/* Nothing to copy (empty file or O_TRUNC copy up) */
	if (len == 0)
		return 0;

	old_file = ovl_path_open(old, O_LARGEFILE | O_RDONLY);
	if (IS_ERR(old_file))
		return PTR_ERR(old_file);

	new_file = ovl_path_open(new, O_LARGEFILE | O_WRONLY);
	if (IS_ERR(new_file)) {
		error = PTR_ERR(new_file);
		goto out_fput;
	}

	/* Try to use clone_file_range to clone up within the same fs */
	error = vfs_clone_file_range(old_file, 0, new_file, 0, len);
	if (!error)
		goto out;
	/* Couldn't clone, so now we try to copy the data */
	error = 0;

	/* FIXME: copy up sparse files efficiently */
	while (len) {
		size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
		long bytes;

		if (len < this_len)
			this_len = len;

		/* Allow a fatal signal to abort a long copy up */
		if (signal_pending_state(TASK_KILLABLE, current)) {
			error = -EINTR;
			break;
		}

		bytes = do_splice_direct(old_file, &old_pos,
					 new_file, &new_pos,
					 this_len, SPLICE_F_MOVE);
		/* bytes == 0 (unexpected EOF) also terminates with error = 0 */
		if (bytes <= 0) {
			error = bytes;
			break;
		}
		WARN_ON(old_pos != new_pos);

		len -= bytes;
	}
out:
	if (!error)
		error = vfs_fsync(new_file, 0);
	fput(new_file);
out_fput:
	fput(old_file);
	return error;
}
|
|
||||||
|
|
||||||
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
|
|
||||||
{
|
|
||||||
struct iattr attr = {
|
|
||||||
.ia_valid =
|
|
||||||
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
|
|
||||||
.ia_atime = stat->atime,
|
|
||||||
.ia_mtime = stat->mtime,
|
|
||||||
};
|
|
||||||
|
|
||||||
return notify_change(upperdentry, &attr, NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Apply mode, ownership and timestamps from @stat to @upperdentry.
 *
 * Mode is skipped for symlinks (their mode is not meaningful). Each step
 * only runs if the previous one succeeded; the timestamp restore is
 * best-effort — its return value is intentionally ignored here, so a
 * timestamp failure does not fail the copy up.
 * Returns 0 or the first negative errno from notify_change().
 */
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
	int err = 0;

	if (!S_ISLNK(stat->mode)) {
		struct iattr attr = {
			.ia_valid = ATTR_MODE,
			.ia_mode = stat->mode,
		};
		err = notify_change(upperdentry, &attr, NULL);
	}
	if (!err) {
		struct iattr attr = {
			.ia_valid = ATTR_UID | ATTR_GID,
			.ia_uid = stat->uid,
			.ia_gid = stat->gid,
		};
		err = notify_change(upperdentry, &attr, NULL);
	}
	if (!err)
		ovl_set_timestamps(upperdentry, stat);

	return err;
}
|
|
||||||
|
|
||||||
/*
 * Encode an overlayfs file handle (struct ovl_fh) for real dentry @real.
 *
 * @real:     real (upper or lower) dentry to encode
 * @is_upper: true if @real belongs to the upper layer; recorded in the
 *            handle flags so decode can tell which stack to assign it to
 *
 * Returns a kmalloc'ed ovl_fh (caller frees) or ERR_PTR: -ENOMEM on
 * allocation failure, -EIO if the underlying fs produced an invalid or
 * oversized handle.
 */
struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper)
{
	struct ovl_fh *fh;
	int fh_type, fh_len, dwords;
	void *buf;
	int buflen = MAX_HANDLE_SZ;
	uuid_t *uuid = &real->d_sb->s_uuid;

	buf = kmalloc(buflen, GFP_KERNEL);
	if (!buf)
		return ERR_PTR(-ENOMEM);

	/*
	 * We encode a non-connectable file handle for non-dir, because we
	 * only need to find the lower inode number and we don't want to pay
	 * the price of reconnecting the dentry.
	 */
	dwords = buflen >> 2;
	fh_type = exportfs_encode_fh(real, buf, &dwords, 0);
	buflen = (dwords << 2);

	fh = ERR_PTR(-EIO);
	if (WARN_ON(fh_type < 0) ||
	    WARN_ON(buflen > MAX_HANDLE_SZ) ||
	    WARN_ON(fh_type == FILEID_INVALID))
		goto out;

	/* fh->len is a u8, so the whole handle must fit in 255 bytes */
	BUILD_BUG_ON(MAX_HANDLE_SZ + offsetof(struct ovl_fh, fid) > 255);
	fh_len = offsetof(struct ovl_fh, fid) + buflen;
	fh = kmalloc(fh_len, GFP_KERNEL);
	if (!fh) {
		fh = ERR_PTR(-ENOMEM);
		goto out;
	}

	fh->version = OVL_FH_VERSION;
	fh->magic = OVL_FH_MAGIC;
	fh->type = fh_type;
	fh->flags = OVL_FH_FLAG_CPU_ENDIAN;
	/*
	 * When we will want to decode an overlay dentry from this handle
	 * and all layers are on the same fs, if we get a disconnected real
	 * dentry when we decode fid, the only way to tell if we should assign
	 * it to upperdentry or to lowerstack is by checking this flag.
	 */
	if (is_upper)
		fh->flags |= OVL_FH_FLAG_PATH_UPPER;
	fh->len = fh_len;
	fh->uuid = *uuid;
	memcpy(fh->fid, buf, buflen);

out:
	kfree(buf);
	return fh;
}
|
|
||||||
|
|
||||||
/*
 * Record the lower origin of a copied-up file: store an encoded file
 * handle of @lower in the overlay.origin xattr on @upper.
 *
 * Returns 0 or negative errno. Failure to set the xattr because the upper
 * fs lacks xattr support is tolerated (see ovl_check_setxattr).
 */
int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
		   struct dentry *upper)
{
	const struct ovl_fh *fh = NULL;
	int err;

	/*
	 * When lower layer doesn't support export operations store a 'null' fh,
	 * so we can use the overlay.origin xattr to distinguish between a copy
	 * up and a pure upper inode.
	 */
	if (ovl_can_decode_fh(lower->d_sb)) {
		fh = ovl_encode_real_fh(lower, false);
		if (IS_ERR(fh))
			return PTR_ERR(fh);
	}

	/*
	 * Do not fail when upper doesn't support xattrs.
	 */
	err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh,
				 fh ? fh->len : 0, 0);
	kfree(fh);

	return err;
}
|
|
||||||
|
|
||||||
/* Store file handle of @upper dir in @index dir entry */
|
|
||||||
static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index)
|
|
||||||
{
|
|
||||||
const struct ovl_fh *fh;
|
|
||||||
int err;
|
|
||||||
|
|
||||||
fh = ovl_encode_real_fh(upper, true);
|
|
||||||
if (IS_ERR(fh))
|
|
||||||
return PTR_ERR(fh);
|
|
||||||
|
|
||||||
err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh, fh->len, 0);
|
|
||||||
|
|
||||||
kfree(fh);
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Create and install index entry.
 *
 * Caller must hold i_mutex on indexdir.
 */
static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
			    struct dentry *upper)
{
	struct dentry *indexdir = ovl_indexdir(dentry->d_sb);
	struct inode *dir = d_inode(indexdir);
	struct dentry *index = NULL;
	struct dentry *temp = NULL;
	struct qstr name = { };
	int err;

	/*
	 * For now this is only used for creating index entry for directories,
	 * because non-dir are copied up directly to index and then hardlinked
	 * to upper dir.
	 *
	 * TODO: implement create index for non-dir, so we can call it when
	 * encoding file handle for non-dir in case index does not exist.
	 */
	if (WARN_ON(!d_is_dir(dentry)))
		return -EIO;

	/* Directory not expected to be indexed before copy up */
	if (WARN_ON(ovl_test_flag(OVL_INDEX, d_inode(dentry))))
		return -EIO;

	/* Index name is derived from the origin (lower) file handle */
	err = ovl_get_index_name(origin, &name);
	if (err)
		return err;

	/* Create a mode-000 temp dir, decorate it, then rename into place */
	temp = ovl_create_temp(indexdir, OVL_CATTR(S_IFDIR | 0));
	err = PTR_ERR(temp);
	if (IS_ERR(temp))
		goto free_name;

	err = ovl_set_upper_fh(upper, temp);
	if (err)
		goto out;

	index = lookup_one_len(name.name, indexdir, name.len);
	if (IS_ERR(index)) {
		err = PTR_ERR(index);
	} else {
		/* Atomically publish the fully initialized index entry */
		err = ovl_do_rename(dir, temp, dir, index, 0);
		dput(index);
	}
out:
	if (err)
		ovl_cleanup(dir, temp);
	dput(temp);
free_name:
	kfree(name.name);
	return err;
}
|
|
||||||
|
|
||||||
/* State shared by the steps of a single copy-up operation */
struct ovl_copy_up_ctx {
	struct dentry *parent;		/* overlay parent; NULL if disconnected */
	struct dentry *dentry;		/* overlay dentry being copied up */
	struct path lowerpath;		/* lower source path */
	struct kstat stat;		/* attributes of the lower file */
	struct kstat pstat;		/* parent's atime/mtime, to restore */
	const char *link;		/* symlink target, if S_ISLNK */
	struct dentry *destdir;		/* upper dir to install the copy into */
	struct qstr destname;		/* name to install under destdir */
	struct dentry *workdir;		/* workdir (or indexdir) for temp file */
	bool tmpfile;			/* use O_TMPFILE + link instead of rename */
	bool origin;			/* store overlay.origin xattr on result */
	bool indexed;			/* copy up is going through the index */
};
|
|
||||||
|
|
||||||
/*
 * Create the upper-dir alias (hard link) for an already copied-up file.
 *
 * Used when the file was copied up to the index (broken-off hard link or
 * disconnected dentry) and now needs a name in the upper parent dir.
 * Takes the upper parent's i_mutex around lookup + link. Returns 0 or
 * negative errno.
 */
static int ovl_link_up(struct ovl_copy_up_ctx *c)
{
	int err;
	struct dentry *upper;
	struct dentry *upperdir = ovl_dentry_upper(c->parent);
	struct inode *udir = d_inode(upperdir);

	/* Mark parent "impure" because it may now contain non-pure upper */
	err = ovl_set_impure(c->parent, upperdir);
	if (err)
		return err;

	/* Snapshot lower nlink before it can diverge from upper */
	err = ovl_set_nlink_lower(c->dentry);
	if (err)
		return err;

	inode_lock_nested(udir, I_MUTEX_PARENT);
	upper = lookup_one_len(c->dentry->d_name.name, upperdir,
			       c->dentry->d_name.len);
	err = PTR_ERR(upper);
	if (!IS_ERR(upper)) {
		err = ovl_do_link(ovl_dentry_upper(c->dentry), udir, upper);
		dput(upper);

		if (!err) {
			/* Restore timestamps on parent (best effort) */
			ovl_set_timestamps(upperdir, &c->pstat);
			ovl_dentry_set_upper_alias(c->dentry);
		}
	}
	inode_unlock(udir);
	if (err)
		return err;

	/* Account the new upper link in the overlay inode's nlink */
	err = ovl_set_nlink_upper(c->dentry);

	return err;
}
|
|
||||||
|
|
||||||
static int ovl_install_temp(struct ovl_copy_up_ctx *c, struct dentry *temp,
|
|
||||||
struct dentry **newdentry)
|
|
||||||
{
|
|
||||||
int err;
|
|
||||||
struct dentry *upper;
|
|
||||||
struct inode *udir = d_inode(c->destdir);
|
|
||||||
|
|
||||||
upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
|
|
||||||
if (IS_ERR(upper))
|
|
||||||
return PTR_ERR(upper);
|
|
||||||
|
|
||||||
if (c->tmpfile)
|
|
||||||
err = ovl_do_link(temp, udir, upper);
|
|
||||||
else
|
|
||||||
err = ovl_do_rename(d_inode(c->workdir), temp, udir, upper, 0);
|
|
||||||
|
|
||||||
if (!err)
|
|
||||||
*newdentry = dget(c->tmpfile ? upper : temp);
|
|
||||||
dput(upper);
|
|
||||||
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Create the temporary upper object that will receive the copied-up data:
 * an O_TMPFILE in c->workdir when c->tmpfile is set, otherwise a named
 * temp object created via ovl_create_temp().
 *
 * Lets the security module substitute creds for the creation
 * (security_inode_copy_up), overriding/reverting around the create call.
 * Returns the new dentry or ERR_PTR.
 */
static struct dentry *ovl_get_tmpfile(struct ovl_copy_up_ctx *c)
{
	int err;
	struct dentry *temp;
	const struct cred *old_creds = NULL;
	struct cred *new_creds = NULL;
	struct ovl_cattr cattr = {
		/* Can't properly set mode on creation because of the umask */
		.mode = c->stat.mode & S_IFMT,
		.rdev = c->stat.rdev,
		.link = c->link
	};

	err = security_inode_copy_up(c->dentry, &new_creds);
	temp = ERR_PTR(err);
	if (err < 0)
		goto out;

	if (new_creds)
		old_creds = override_creds(new_creds);

	if (c->tmpfile)
		temp = ovl_do_tmpfile(c->workdir, c->stat.mode);
	else
		temp = ovl_create_temp(c->workdir, &cattr);
out:
	if (new_creds) {
		revert_creds(old_creds);
		put_cred(new_creds);
	}

	return temp;
}
|
|
||||||
|
|
||||||
/*
 * Fill temp upper object @temp with everything from the lower file:
 * data (regular files only), xattrs, attributes, and — unless breaking a
 * lower hard link — the overlay.origin file handle.
 * Returns 0 or negative errno; caller cleans up @temp on failure.
 */
static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
{
	int err;

	if (S_ISREG(c->stat.mode)) {
		struct path upperpath;

		ovl_path_upper(c->dentry, &upperpath);
		/* Caller guarantees no upper yet; temp stands in for it */
		BUG_ON(upperpath.dentry != NULL);
		upperpath.dentry = temp;

		err = ovl_copy_up_data(&c->lowerpath, &upperpath, c->stat.size);
		if (err)
			return err;
	}

	err = ovl_copy_xattr(c->lowerpath.dentry, temp);
	if (err)
		return err;

	inode_lock(temp->d_inode);
	err = ovl_set_attr(temp, &c->stat);
	inode_unlock(temp->d_inode);
	if (err)
		return err;

	/*
	 * Store identifier of lower inode in upper inode xattr to
	 * allow lookup of the copy up origin inode.
	 *
	 * Don't set origin when we are breaking the association with a lower
	 * hard link.
	 */
	if (c->origin) {
		err = ovl_set_origin(c->dentry, c->lowerpath.dentry, temp);
		if (err)
			return err;
	}

	return 0;
}
|
|
||||||
|
|
||||||
/*
 * Perform the copy up with the needed locks already arranged by the
 * caller (rename lock for the workdir path; none needed yet for the
 * O_TMPFILE path, which takes destdir's i_mutex just for the install).
 *
 * Creates the temp object, populates it, optionally creates the dir
 * index entry, installs the temp under its final name, and updates the
 * overlay inode to point at the new upper dentry.
 * Returns 0 or negative errno.
 */
static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c)
{
	struct inode *udir = c->destdir->d_inode;
	struct inode *inode;
	struct dentry *newdentry = NULL;
	struct dentry *temp;
	int err;

	temp = ovl_get_tmpfile(c);
	if (IS_ERR(temp))
		return PTR_ERR(temp);

	err = ovl_copy_up_inode(c, temp);
	if (err)
		goto out;

	/* Index a copied-up dir before installing it (see ovl_do_copy_up) */
	if (S_ISDIR(c->stat.mode) && c->indexed) {
		err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp);
		if (err)
			goto out;
	}

	if (c->tmpfile) {
		/* O_TMPFILE path: destdir not yet locked, lock it for link */
		inode_lock_nested(udir, I_MUTEX_PARENT);
		err = ovl_install_temp(c, temp, &newdentry);
		inode_unlock(udir);
	} else {
		err = ovl_install_temp(c, temp, &newdentry);
	}
	if (err)
		goto out;

	inode = d_inode(c->dentry);
	ovl_inode_update(inode, newdentry);
	/* A copied-up dir may contain whiteouts from its lower merge */
	if (S_ISDIR(inode->i_mode))
		ovl_set_flag(OVL_WHITEOUTS, inode);

out:
	/* An unlinked O_TMPFILE needs no cleanup; workdir temp does */
	if (err && !c->tmpfile)
		ovl_cleanup(d_inode(c->workdir), temp);
	dput(temp);
	return err;

}
|
|
||||||
|
|
||||||
/*
 * Copy up a single dentry
 *
 * All renames start with copy up of source if necessary.  The actual
 * rename will only proceed once the copy up was successful.  Copy up uses
 * upper parent i_mutex for exclusion.  Since rename can change d_parent it
 * is possible that the copy up will lock the old parent.  At that point
 * the file will have already been copied up anyway.
 */
static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
{
	int err;
	struct ovl_fs *ofs = c->dentry->d_sb->s_fs_info;
	bool to_index = false;

	/*
	 * Indexed non-dir is copied up directly to the index entry and then
	 * hardlinked to upper dir. Indexed dir is copied up to indexdir,
	 * then index entry is created and then copied up dir installed.
	 * Copying dir up to indexdir instead of workdir simplifies locking.
	 */
	if (ovl_need_index(c->dentry)) {
		c->indexed = true;
		if (S_ISDIR(c->stat.mode))
			c->workdir = ovl_indexdir(c->dentry->d_sb);
		else
			to_index = true;
	}

	/* Origin is not stored when breaking a lower hard link (nlink > 1) */
	if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index)
		c->origin = true;

	if (to_index) {
		c->destdir = ovl_indexdir(c->dentry->d_sb);
		err = ovl_get_index_name(c->lowerpath.dentry, &c->destname);
		if (err)
			return err;
	} else if (WARN_ON(!c->parent)) {
		/* Disconnected dentry must be copied up to index dir */
		return -EIO;
	} else {
		/*
		 * Mark parent "impure" because it may now contain non-pure
		 * upper
		 */
		err = ovl_set_impure(c->parent, c->destdir);
		if (err)
			return err;
	}

	/* Should we copyup with O_TMPFILE or with workdir? */
	if (S_ISREG(c->stat.mode) && ofs->tmpfile) {
		c->tmpfile = true;
		err = ovl_copy_up_locked(c);
	} else {
		err = ovl_lock_rename_workdir(c->workdir, c->destdir);
		if (!err) {
			err = ovl_copy_up_locked(c);
			unlock_rename(c->workdir, c->destdir);
		}
	}


	if (err)
		goto out;

	if (c->indexed)
		ovl_set_flag(OVL_INDEX, d_inode(c->dentry));

	if (to_index) {
		/* Initialize nlink for copy up of disconnected dentry */
		err = ovl_set_nlink_upper(c->dentry);
	} else {
		struct inode *udir = d_inode(c->destdir);

		/* Restore timestamps on parent (best effort) */
		inode_lock(udir);
		ovl_set_timestamps(c->destdir, &c->pstat);
		inode_unlock(udir);

		ovl_dentry_set_upper_alias(c->dentry);
	}

out:
	/* destname was allocated by ovl_get_index_name() above */
	if (to_index)
		kfree(c->destname.name);
	return err;
}
|
|
||||||
|
|
||||||
/*
 * Copy up one overlay dentry @dentry whose upper parent is @parent
 * (NULL for a disconnected dentry, which goes to the index dir).
 *
 * Gathers lower attributes, parent timestamps and symlink target, then
 * serializes with concurrent copy ups of the same file via
 * ovl_copy_up_start()/end(). @flags may carry O_TRUNC to skip copying
 * data. Returns 0 or negative errno.
 */
static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
			   int flags)
{
	int err;
	DEFINE_DELAYED_CALL(done);
	struct path parentpath;
	struct ovl_copy_up_ctx ctx = {
		.parent = parent,
		.dentry = dentry,
		.workdir = ovl_workdir(dentry),
	};

	/* No workdir means the overlay is effectively read-only */
	if (WARN_ON(!ctx.workdir))
		return -EROFS;

	ovl_path_lower(dentry, &ctx.lowerpath);
	err = vfs_getattr(&ctx.lowerpath, &ctx.stat,
			  STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
	if (err)
		return err;

	if (parent) {
		ovl_path_upper(parent, &parentpath);
		ctx.destdir = parentpath.dentry;
		ctx.destname = dentry->d_name;

		/* Remember parent times so they can be restored after */
		err = vfs_getattr(&parentpath, &ctx.pstat,
				  STATX_ATIME | STATX_MTIME,
				  AT_STATX_SYNC_AS_STAT);
		if (err)
			return err;
	}

	/* maybe truncate regular file. this has no effect on dirs */
	if (flags & O_TRUNC)
		ctx.stat.size = 0;

	if (S_ISLNK(ctx.stat.mode)) {
		ctx.link = vfs_get_link(ctx.lowerpath.dentry, &done);
		if (IS_ERR(ctx.link))
			return PTR_ERR(ctx.link);
	}
	ovl_do_check_copy_up(ctx.lowerpath.dentry);

	err = ovl_copy_up_start(dentry);
	/* err < 0: interrupted, err > 0: raced with another copy-up */
	if (unlikely(err)) {
		if (err > 0)
			err = 0;
	} else {
		if (!ovl_dentry_upper(dentry))
			err = ovl_do_copy_up(&ctx);
		if (!err && parent && !ovl_dentry_has_upper_alias(dentry))
			err = ovl_link_up(&ctx);
		ovl_copy_up_end(dentry);
	}
	do_delayed_call(&done);

	return err;
}
|
|
||||||
|
|
||||||
int ovl_copy_up_flags(struct dentry *dentry, int flags)
|
|
||||||
{
|
|
||||||
int err = 0;
|
|
||||||
const struct cred *old_cred = ovl_override_creds(dentry->d_sb);
|
|
||||||
bool disconnected = (dentry->d_flags & DCACHE_DISCONNECTED);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* With NFS export, copy up can get called for a disconnected non-dir.
|
|
||||||
* In this case, we will copy up lower inode to index dir without
|
|
||||||
* linking it to upper dir.
|
|
||||||
*/
|
|
||||||
if (WARN_ON(disconnected && d_is_dir(dentry)))
|
|
||||||
return -EIO;
|
|
||||||
|
|
||||||
while (!err) {
|
|
||||||
struct dentry *next;
|
|
||||||
struct dentry *parent = NULL;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Check if copy-up has happened as well as for upper alias (in
|
|
||||||
* case of hard links) is there.
|
|
||||||
*
|
|
||||||
* Both checks are lockless:
|
|
||||||
* - false negatives: will recheck under oi->lock
|
|
||||||
* - false positives:
|
|
||||||
* + ovl_dentry_upper() uses memory barriers to ensure the
|
|
||||||
* upper dentry is up-to-date
|
|
||||||
* + ovl_dentry_has_upper_alias() relies on locking of
|
|
||||||
* upper parent i_rwsem to prevent reordering copy-up
|
|
||||||
* with rename.
|
|
||||||
*/
|
|
||||||
if (ovl_dentry_upper(dentry) &&
|
|
||||||
(ovl_dentry_has_upper_alias(dentry) || disconnected))
|
|
||||||
break;
|
|
||||||
|
|
||||||
next = dget(dentry);
|
|
||||||
/* find the topmost dentry not yet copied up */
|
|
||||||
for (; !disconnected;) {
|
|
||||||
parent = dget_parent(next);
|
|
||||||
|
|
||||||
if (ovl_dentry_upper(parent))
|
|
||||||
break;
|
|
||||||
|
|
||||||
dput(next);
|
|
||||||
next = parent;
|
|
||||||
}
|
|
||||||
|
|
||||||
err = ovl_copy_up_one(parent, next, flags);
|
|
||||||
|
|
||||||
dput(parent);
|
|
||||||
dput(next);
|
|
||||||
}
|
|
||||||
revert_creds(old_cred);
|
|
||||||
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Copy up @dentry with default flags (data is copied, no truncate). */
int ovl_copy_up(struct dentry *dentry)
{
	return ovl_copy_up_flags(dentry, 0);
}
|
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -1,853 +0,0 @@
|
|||||||
/*
|
|
||||||
* Overlayfs NFS export support.
|
|
||||||
*
|
|
||||||
* Amir Goldstein <amir73il@gmail.com>
|
|
||||||
*
|
|
||||||
* Copyright (C) 2017-2018 CTERA Networks. All Rights Reserved.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 as published by
|
|
||||||
* the Free Software Foundation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <linux/fs.h>
|
|
||||||
#include <linux/cred.h>
|
|
||||||
#include <linux/mount.h>
|
|
||||||
#include <linux/namei.h>
|
|
||||||
#include <linux/xattr.h>
|
|
||||||
#include <linux/exportfs.h>
|
|
||||||
#include <linux/ratelimit.h>
|
|
||||||
#include <linux/version.h>
|
|
||||||
#include "overlayfs.h"
|
|
||||||
|
|
||||||
/*
 * Before encoding a file handle, make sure @dentry has an upper copy,
 * copying it up if needed. Failures are logged (ratelimited) and
 * returned as negative errno; 0 on success or if already copied up.
 */
static int ovl_encode_maybe_copy_up(struct dentry *dentry)
{
	int err;

	/* Already has an upper object: nothing to do */
	if (ovl_dentry_upper(dentry))
		return 0;

	err = ovl_want_write(dentry);
	if (err)
		goto report;

	err = ovl_copy_up(dentry);
	ovl_drop_write(dentry);
	if (!err)
		return 0;
report:
	pr_warn_ratelimited("overlayfs: failed to copy up on encode (%pd2, err=%i)\n",
			    dentry, err);
	return err;
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Before encoding a non-upper directory file handle from real layer N, we need
|
|
||||||
* to check if it will be possible to reconnect an overlay dentry from the real
|
|
||||||
* lower decoded dentry. This is done by following the overlay ancestry up to a
|
|
||||||
* "layer N connected" ancestor and verifying that all parents along the way are
|
|
||||||
* "layer N connectable". If an ancestor that is NOT "layer N connectable" is
|
|
||||||
* found, we need to copy up an ancestor, which is "layer N connectable", thus
|
|
||||||
* making that ancestor "layer N connected". For example:
|
|
||||||
*
|
|
||||||
* layer 1: /a
|
|
||||||
* layer 2: /a/b/c
|
|
||||||
*
|
|
||||||
* The overlay dentry /a is NOT "layer 2 connectable", because if dir /a is
|
|
||||||
* copied up and renamed, upper dir /a will be indexed by lower dir /a from
|
|
||||||
* layer 1. The dir /a from layer 2 will never be indexed, so the algorithm (*)
|
|
||||||
* in ovl_lookup_real_ancestor() will not be able to lookup a connected overlay
|
|
||||||
* dentry from the connected lower dentry /a/b/c.
|
|
||||||
*
|
|
||||||
* To avoid this problem on decode time, we need to copy up an ancestor of
|
|
||||||
* /a/b/c, which is "layer 2 connectable", on encode time. That ancestor is
|
|
||||||
* /a/b. After copy up (and index) of /a/b, it will become "layer 2 connected"
|
|
||||||
* and when the time comes to decode the file handle from lower dentry /a/b/c,
|
|
||||||
* ovl_lookup_real_ancestor() will find the indexed ancestor /a/b and decoding
|
|
||||||
* a connected overlay dentry will be accomplished.
|
|
||||||
*
|
|
||||||
* (*) the algorithm in ovl_lookup_real_ancestor() can be improved to lookup an
|
|
||||||
* entry /a in the lower layers above layer N and find the indexed dir /a from
|
|
||||||
* layer 1. If that improvement is made, then the check for "layer N connected"
|
|
||||||
* will need to verify there are no redirects in lower layers above N. In the
|
|
||||||
* example above, /a will be "layer 2 connectable". However, if layer 2 dir /a
|
|
||||||
* is a target of a layer 1 redirect, then /a will NOT be "layer 2 connectable":
|
|
||||||
*
|
|
||||||
* layer 1: /A (redirect = /a)
|
|
||||||
* layer 2: /a/b/c
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* Return the lowest layer for encoding a connectable file handle */
static int ovl_connectable_layer(struct dentry *dentry)
{
	struct ovl_entry *oe = OVL_E(dentry);

	/* We can get overlay root from root of any layer */
	if (dentry == dentry->d_sb->s_root)
		return oe->numlower;

	/*
	 * If it's an unindexed merge dir, then it's not connectable with any
	 * lower layer
	 */
	if (ovl_dentry_upper(dentry) &&
	    !ovl_test_flag(OVL_INDEX, d_inode(dentry)))
		return 0;

	/* We can get upper/overlay path from indexed/lower dentry */
	return oe->lowerstack[0].layer->idx;
}
|
|
||||||
|
|
||||||
/*
 * @dentry is "connected" if all ancestors up to root or a "connected" ancestor
 * have the same uppermost lower layer as the origin's layer. We may need to
 * copy up a "connectable" ancestor to make it "connected". A "connected" dentry
 * cannot become non "connected", so cache positive result in dentry flags.
 *
 * Return the connected origin layer or < 0 on error.
 */
static int ovl_connect_layer(struct dentry *dentry)
{
	struct dentry *next, *parent = NULL;
	int origin_layer;
	int err = 0;

	if (WARN_ON(dentry == dentry->d_sb->s_root) ||
	    WARN_ON(!ovl_dentry_lower(dentry)))
		return -EIO;

	origin_layer = OVL_E(dentry)->lowerstack[0].layer->idx;
	/* Cached positive result: already known to be connected */
	if (ovl_dentry_test_flag(OVL_E_CONNECTED, dentry))
		return origin_layer;

	/* Find the topmost origin layer connectable ancestor of @dentry */
	next = dget(dentry);
	for (;;) {
		parent = dget_parent(next);
		/* Reaching our own parent means we hit the root: bug */
		if (WARN_ON(parent == next)) {
			err = -EIO;
			break;
		}

		/*
		 * If @parent is not origin layer connectable, then copy up
		 * @next which is origin layer connectable and we are done.
		 */
		if (ovl_connectable_layer(parent) < origin_layer) {
			err = ovl_encode_maybe_copy_up(next);
			break;
		}

		/* If @parent is connected or indexed we are done */
		if (ovl_dentry_test_flag(OVL_E_CONNECTED, parent) ||
		    ovl_test_flag(OVL_INDEX, d_inode(parent)))
			break;

		dput(next);
		next = parent;
	}

	dput(parent);
	dput(next);

	if (!err)
		ovl_dentry_set_flag(OVL_E_CONNECTED, dentry);

	return err ?: origin_layer;
}
|
|
||||||
|
|
||||||
/*
 * We only need to encode origin if there is a chance that the same object was
 * encoded pre copy up and then we need to stay consistent with the same
 * encoding also after copy up. If non-pure upper is not indexed, then it was
 * copied up before NFS export was enabled. In that case we don't need to worry
 * about staying consistent with pre copy up encoding and we encode an upper
 * file handle. Overlay root dentry is a private case of non-indexed upper.
 *
 * The following table summarizes the different file handle encodings used for
 * different overlay object types:
 *
 *  Object type		| Encoding
 * --------------------------------
 *  Pure upper		| U
 *  Non-indexed upper	| U
 *  Indexed upper	| L (*)
 *  Non-upper		| L (*)
 *
 * U = upper file handle
 * L = lower file handle
 *
 * (*) Connecting an overlay dir from real lower dentry is not always
 * possible when there are redirects in lower layers and non-indexed merge dirs.
 * To mitigate those cases, we may copy up the lower dir ancestor before
 * encoding a lower dir file handle.
 *
 * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error.
 */
static int ovl_check_encode_origin(struct dentry *dentry)
{
	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;

	/* Upper file handle for pure upper */
	if (!ovl_dentry_lower(dentry))
		return 0;

	/*
	 * Upper file handle for non-indexed upper.
	 *
	 * Root is never indexed, so if there's an upper layer, encode upper for
	 * root.
	 */
	if (ovl_dentry_upper(dentry) &&
	    !ovl_test_flag(OVL_INDEX, d_inode(dentry)))
		return 0;

	/*
	 * Decoding a merge dir, whose origin's ancestor is under a redirected
	 * lower dir or under a non-indexed upper is not always possible.
	 * ovl_connect_layer() will try to make origin's layer "connected" by
	 * copying up a "connectable" ancestor.
	 */
	if (d_is_dir(dentry) && ofs->upper_mnt)
		return ovl_connect_layer(dentry);

	/* Lower file handle for indexed and non-upper dir/non-dir */
	return 1;
}
|
|
||||||
|
|
||||||
static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen)
|
|
||||||
{
|
|
||||||
struct ovl_fh *fh = NULL;
|
|
||||||
int err, enc_lower;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Check if we should encode a lower or upper file handle and maybe
|
|
||||||
* copy up an ancestor to make lower file handle connectable.
|
|
||||||
*/
|
|
||||||
err = enc_lower = ovl_check_encode_origin(dentry);
|
|
||||||
if (enc_lower < 0)
|
|
||||||
goto fail;
|
|
||||||
|
|
||||||
/* Encode an upper or lower file handle */
|
|
||||||
fh = ovl_encode_real_fh(enc_lower ? ovl_dentry_lower(dentry) :
|
|
||||||
ovl_dentry_upper(dentry), !enc_lower);
|
|
||||||
err = PTR_ERR(fh);
|
|
||||||
if (IS_ERR(fh))
|
|
||||||
goto fail;
|
|
||||||
|
|
||||||
err = -EOVERFLOW;
|
|
||||||
if (fh->len > buflen)
|
|
||||||
goto fail;
|
|
||||||
|
|
||||||
memcpy(buf, (char *)fh, fh->len);
|
|
||||||
err = fh->len;
|
|
||||||
|
|
||||||
out:
|
|
||||||
kfree(fh);
|
|
||||||
return err;
|
|
||||||
|
|
||||||
fail:
|
|
||||||
pr_warn_ratelimited("overlayfs: failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n",
|
|
||||||
dentry, err, buflen, fh ? (int)fh->len : 0,
|
|
||||||
fh ? fh->type : 0);
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Encode @dentry into the exportfs fid buffer. @max_len is in/out in
 * units of 32-bit words (the exportfs convention).
 *
 * Returns OVL_FILEID on success or FILEID_INVALID on failure/overflow.
 */
static int ovl_dentry_to_fh(struct dentry *dentry, u32 *fid, int *max_len)
{
	/* Convert caller's dword count to a byte length for ovl_d_to_fh() */
	int res, len = *max_len << 2;

	res = ovl_d_to_fh(dentry, (char *)fid, len);
	if (res <= 0)
		return FILEID_INVALID;

	len = res;

	/* Round up to dwords */
	*max_len = (len + 3) >> 2;
	return OVL_FILEID;
}
|
|
||||||
|
|
||||||
/*
 * exportfs ->encode_fh() entry point for overlayfs.
 *
 * Connectable handles (non-NULL @parent) are not supported; callers must
 * export with 'no_subtree_check'. Returns a FILEID_* type or FILEID_INVALID.
 */
static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len,
			 struct inode *parent)
{
	struct dentry *dentry;
	int type;

	/* TODO: encode connectable file handles */
	if (parent)
		return FILEID_INVALID;

	/* Any alias will do - encoding is keyed by the real origin, not path */
	dentry = d_find_any_alias(inode);
	if (WARN_ON(!dentry))
		return FILEID_INVALID;

	type = ovl_dentry_to_fh(dentry, fid, max_len);

	dput(dentry);
	return type;
}
|
|
||||||
|
|
||||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0)
|
|
||||||
/*
 * Find or instantiate an overlay dentry from real dentries and index.
 *
 * Only for non-directories: directories are obtained via ovl_lookup_real().
 * On success returns a (possibly disconnected) overlay dentry holding a
 * reference; on failure returns ERR_PTR().
 */
static struct dentry *ovl_obtain_alias(struct super_block *sb,
				       struct dentry *upper_alias,
				       struct ovl_path *lowerpath,
				       struct dentry *index)
{
	struct dentry *lower = lowerpath ? lowerpath->dentry : NULL;
	/* An index dentry can stand in for the upper when there is no alias */
	struct dentry *upper = upper_alias ?: index;
	struct dentry *dentry;
	struct inode *inode;
	struct ovl_entry *oe;
	struct ovl_inode_params oip = {
		.lowerpath = lowerpath,
		.index = index,
		.numlower = !!lower
	};

	/* We get overlay directory dentries with ovl_lookup_real() */
	if (d_is_dir(upper ?: lower))
		return ERR_PTR(-EIO);

	oip.upperdentry = dget(upper);
	inode = ovl_get_inode(sb, &oip);
	if (IS_ERR(inode)) {
		dput(upper);
		return ERR_CAST(inode);
	}

	dentry = d_find_any_alias(inode);
	if (!dentry) {
		dentry = d_alloc_anon(inode->i_sb);
		if (!dentry)
			goto nomem;
		oe = ovl_alloc_entry(lower ? 1 : 0);
		if (!oe)
			goto nomem;

		if (lower) {
			oe->lowerstack->dentry = dget(lower);
			oe->lowerstack->layer = lowerpath->layer;
		}
		dentry->d_fsdata = oe;
		if (upper_alias)
			ovl_dentry_set_upper_alias(dentry);
	}

	return d_instantiate_anon(dentry, inode);

nomem:
	iput(inode);
	dput(dentry);
	return ERR_PTR(-ENOMEM);
}
|
|
||||||
|
|
||||||
/* Get the upper or lower dentry in the stack which is on layer @idx */
static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx)
{
	struct ovl_entry *oe = dentry->d_fsdata;
	int i;

	/* Layer index 0 means the upper layer */
	if (!idx)
		return ovl_dentry_upper(dentry);

	for (i = 0; i < oe->numlower; i++) {
		if (oe->lowerstack[i].layer->idx == idx)
			return oe->lowerstack[i].dentry;
	}

	/* No real dentry on layer @idx */
	return NULL;
}
|
|
||||||
|
|
||||||
/*
 * Lookup a child overlay dentry to get a connected overlay dentry whose real
 * dentry is @real. If @real is on upper layer, we lookup a child overlay
 * dentry with the same name as the real dentry. Otherwise, we need to consult
 * index for lookup.
 *
 * Returns the child overlay dentry or ERR_PTR() (-ECHILD signals to the
 * caller that it must restart connecting from the top).
 */
static struct dentry *ovl_lookup_real_one(struct dentry *connected,
					  struct dentry *real,
					  struct ovl_layer *layer)
{
	struct inode *dir = d_inode(connected);
	struct dentry *this, *parent = NULL;
	struct name_snapshot name;
	int err;

	/*
	 * Lookup child overlay dentry by real name. The dir mutex protects us
	 * from racing with overlay rename. If the overlay dentry that is above
	 * real has already been moved to a parent that is not under the
	 * connected overlay dir, we return -ECHILD and restart the lookup of
	 * connected real path from the top.
	 */
	inode_lock_nested(dir, I_MUTEX_PARENT);
	err = -ECHILD;
	parent = dget_parent(real);
	if (ovl_dentry_real_at(connected, layer->idx) != parent)
		goto fail;

	/*
	 * We also need to take a snapshot of real dentry name to protect us
	 * from racing with underlying layer rename. In this case, we don't
	 * care about returning ESTALE, only from dereferencing a free name
	 * pointer because we hold no lock on the real dentry.
	 */
	take_dentry_name_snapshot(&name, real);
	this = lookup_one_len(name.name, connected, strlen(name.name));
	err = PTR_ERR(this);
	if (IS_ERR(this)) {
		goto fail;
	} else if (!this || !this->d_inode) {
		/* Negative child: @real was removed/renamed under us */
		dput(this);
		err = -ENOENT;
		goto fail;
	} else if (ovl_dentry_real_at(this, layer->idx) != real) {
		/* Child found, but it is backed by a different real dentry */
		dput(this);
		err = -ESTALE;
		goto fail;
	}

out:
	release_dentry_name_snapshot(&name);
	dput(parent);
	inode_unlock(dir);
	return this;

fail:
	pr_warn_ratelimited("overlayfs: failed to lookup one by real (%pd2, layer=%d, connected=%pd2, err=%i)\n",
			    real, layer->idx, connected, err);
	this = ERR_PTR(err);
	goto out;
}
|
|
||||||
|
|
||||||
/* Forward declaration: ovl_lookup_real_inode() may recurse into it */
static struct dentry *ovl_lookup_real(struct super_block *sb,
				      struct dentry *real,
				      struct ovl_layer *layer);
|
|
||||||
|
|
||||||
/*
 * Lookup an indexed or hashed overlay dentry by real inode.
 *
 * Returns a referenced overlay dentry, NULL if none was found, or ERR_PTR().
 */
static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
					    struct dentry *real,
					    struct ovl_layer *layer)
{
	struct ovl_fs *ofs = sb->s_fs_info;
	struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt };
	struct dentry *index = NULL;
	struct dentry *this = NULL;
	struct inode *inode;

	/*
	 * Decoding upper dir from index is expensive, so first try to lookup
	 * overlay dentry in inode/dcache.
	 */
	inode = ovl_lookup_inode(sb, real, !layer->idx);
	if (IS_ERR(inode))
		return ERR_CAST(inode);
	if (inode) {
		this = d_find_any_alias(inode);
		iput(inode);
	}

	/*
	 * For decoded lower dir file handle, lookup index by origin to check
	 * if lower dir was copied up and/or removed.
	 */
	if (!this && layer->idx && ofs->indexdir && !WARN_ON(!d_is_dir(real))) {
		index = ovl_lookup_index(ofs, NULL, real, false);
		if (IS_ERR(index))
			return index;
	}

	/* Get connected upper overlay dir from index */
	if (index) {
		struct dentry *upper = ovl_index_upper(ofs, index);

		dput(index);
		if (IS_ERR_OR_NULL(upper))
			return upper;

		/*
		 * ovl_lookup_real() in lower layer may call recursively once to
		 * ovl_lookup_real() in upper layer. The first level call walks
		 * back lower parents to the topmost indexed parent. The second
		 * recursive call walks back from indexed upper to the topmost
		 * connected/hashed upper parent (or up to root).
		 */
		this = ovl_lookup_real(sb, upper, &upper_layer);
		dput(upper);
	}

	if (IS_ERR_OR_NULL(this))
		return this;

	/* Sanity check: the found overlay dentry must be backed by @real */
	if (WARN_ON(ovl_dentry_real_at(this, layer->idx) != real)) {
		dput(this);
		this = ERR_PTR(-EIO);
	}

	return this;
}
|
|
||||||
|
|
||||||
/*
 * Lookup an indexed or hashed overlay dentry, whose real dentry is an
 * ancestor of @real.
 *
 * Returns a referenced overlay dentry (possibly the overlay root) or
 * ERR_PTR() on failure.
 */
static struct dentry *ovl_lookup_real_ancestor(struct super_block *sb,
					       struct dentry *real,
					       struct ovl_layer *layer)
{
	struct dentry *next, *parent = NULL;
	struct dentry *ancestor = ERR_PTR(-EIO);

	if (real == layer->mnt->mnt_root)
		return dget(sb->s_root);

	/* Find the topmost indexed or hashed ancestor */
	next = dget(real);
	for (;;) {
		parent = dget_parent(next);

		/*
		 * Lookup a matching overlay dentry in inode/dentry
		 * cache or in index by real inode.
		 */
		ancestor = ovl_lookup_real_inode(sb, next, layer);
		if (ancestor)
			break;

		if (parent == layer->mnt->mnt_root) {
			ancestor = dget(sb->s_root);
			break;
		}

		/*
		 * If @real has been moved out of the layer root directory,
		 * we will eventually hit the real fs root. This cannot happen
		 * by legit overlay rename, so we return error in that case.
		 */
		if (parent == next) {
			ancestor = ERR_PTR(-EXDEV);
			break;
		}

		dput(next);
		next = parent;
	}

	/* Drop the walk references; @ancestor carries its own */
	dput(parent);
	dput(next);

	return ancestor;
}
|
|
||||||
|
|
||||||
/*
 * Lookup a connected overlay dentry whose real dentry is @real.
 * If @real is on upper layer, we lookup a child overlay dentry with the same
 * path as the real dentry. Otherwise, we need to consult index for lookup.
 *
 * Returns a referenced connected overlay dentry or ERR_PTR().
 */
static struct dentry *ovl_lookup_real(struct super_block *sb,
				      struct dentry *real,
				      struct ovl_layer *layer)
{
	struct dentry *connected;
	int err = 0;

	/* Start from the nearest ancestor already in cache/index */
	connected = ovl_lookup_real_ancestor(sb, real, layer);
	if (IS_ERR(connected))
		return connected;

	/* Connect downwards, one path component per iteration */
	while (!err) {
		struct dentry *next, *this;
		struct dentry *parent = NULL;
		struct dentry *real_connected = ovl_dentry_real_at(connected,
								   layer->idx);

		if (real_connected == real)
			break;

		/* Find the topmost dentry not yet connected */
		next = dget(real);
		for (;;) {
			parent = dget_parent(next);

			if (parent == real_connected)
				break;

			/*
			 * If real has been moved out of 'real_connected',
			 * we will not find 'real_connected' and hit the layer
			 * root. In that case, we need to restart connecting.
			 * This game can go on forever in the worst case. We
			 * may want to consider taking s_vfs_rename_mutex if
			 * this happens more than once.
			 */
			if (parent == layer->mnt->mnt_root) {
				dput(connected);
				connected = dget(sb->s_root);
				break;
			}

			/*
			 * If real file has been moved out of the layer root
			 * directory, we will eventually hit the real fs root.
			 * This cannot happen by legit overlay rename, so we
			 * return error in that case.
			 */
			if (parent == next) {
				err = -EXDEV;
				break;
			}

			dput(next);
			next = parent;
		}

		if (!err) {
			this = ovl_lookup_real_one(connected, next, layer);
			if (IS_ERR(this))
				err = PTR_ERR(this);

			/*
			 * Lookup of child in overlay can fail when racing with
			 * overlay rename of child away from 'connected' parent.
			 * In this case, we need to restart the lookup from the
			 * top, because we cannot trust that 'real_connected' is
			 * still an ancestor of 'real'. There is a good chance
			 * that the renamed overlay ancestor is now in cache, so
			 * ovl_lookup_real_ancestor() will find it and we can
			 * continue to connect exactly from where lookup failed.
			 */
			if (err == -ECHILD) {
				this = ovl_lookup_real_ancestor(sb, real,
								layer);
				err = PTR_ERR_OR_ZERO(this);
			}
			if (!err) {
				dput(connected);
				connected = this;
			}
		}

		dput(parent);
		dput(next);
	}

	if (err)
		goto fail;

	return connected;

fail:
	pr_warn_ratelimited("overlayfs: failed to lookup by real (%pd2, layer=%d, connected=%pd2, err=%i)\n",
			    real, layer->idx, connected, err);
	dput(connected);
	return ERR_PTR(err);
}
|
|
||||||
|
|
||||||
/*
 * Get an overlay dentry from upper/lower real dentries and index.
 *
 * Non-directories get a (possibly disconnected) alias; directories are
 * connected via ovl_lookup_real(). Returns a referenced dentry or ERR_PTR().
 */
static struct dentry *ovl_get_dentry(struct super_block *sb,
				     struct dentry *upper,
				     struct ovl_path *lowerpath,
				     struct dentry *index)
{
	struct ovl_fs *ofs = sb->s_fs_info;
	struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt };
	struct ovl_layer *layer = upper ? &upper_layer : lowerpath->layer;
	/* Pick the real dentry to decode: upper wins, then index, then lower */
	struct dentry *real = upper ?: (index ?: lowerpath->dentry);

	/*
	 * Obtain a disconnected overlay dentry from a non-dir real dentry
	 * and index.
	 */
	if (!d_is_dir(real))
		return ovl_obtain_alias(sb, upper, lowerpath, index);

	/* Removed empty directory? */
	if ((real->d_flags & DCACHE_DISCONNECTED) || d_unhashed(real))
		return ERR_PTR(-ENOENT);

	/*
	 * If real dentry is connected and hashed, get a connected overlay
	 * dentry whose real dentry is @real.
	 */
	return ovl_lookup_real(sb, real, layer);
}
|
|
||||||
|
|
||||||
/*
 * Decode an upper-layer file handle into an overlay dentry.
 *
 * Returns a referenced dentry, NULL if the upper object is gone (stale
 * handle), or ERR_PTR() (-EACCES if there is no upper layer at all).
 */
static struct dentry *ovl_upper_fh_to_d(struct super_block *sb,
					struct ovl_fh *fh)
{
	struct ovl_fs *ofs = sb->s_fs_info;
	struct dentry *dentry;
	struct dentry *upper;

	if (!ofs->upper_mnt)
		return ERR_PTR(-EACCES);

	upper = ovl_decode_real_fh(fh, ofs->upper_mnt, true);
	if (IS_ERR_OR_NULL(upper))
		return upper;

	dentry = ovl_get_dentry(sb, upper, NULL, NULL);
	dput(upper);

	return dentry;
}
|
|
||||||
|
|
||||||
/*
 * Decode a lower-layer (origin) file handle into an overlay dentry.
 *
 * Lookup order:
 *  1. overlay inode cache keyed by origin,
 *  2. the index dir (to find an indexed upper / whiteout for the origin),
 *  3. a connected non-upper dir or disconnected non-dir built from origin.
 *
 * Returns a referenced dentry, NULL for stale handles, or ERR_PTR().
 */
static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
					struct ovl_fh *fh)
{
	struct ovl_fs *ofs = sb->s_fs_info;
	struct ovl_path origin = { };
	struct ovl_path *stack = &origin;
	struct dentry *dentry = NULL;
	struct dentry *index = NULL;
	struct inode *inode;
	int err;

	/* First lookup overlay inode in inode cache by origin fh */
	err = ovl_check_origin_fh(ofs, fh, false, NULL, &stack);
	if (err)
		return ERR_PTR(err);

	if (!d_is_dir(origin.dentry) ||
	    !(origin.dentry->d_flags & DCACHE_DISCONNECTED)) {
		inode = ovl_lookup_inode(sb, origin.dentry, false);
		err = PTR_ERR(inode);
		if (IS_ERR(inode))
			goto out_err;
		if (inode) {
			dentry = d_find_any_alias(inode);
			iput(inode);
			if (dentry)
				goto out;
		}
	}

	/* Then lookup indexed upper/whiteout by origin fh */
	if (ofs->indexdir) {
		index = ovl_get_index_fh(ofs, fh);
		err = PTR_ERR(index);
		if (IS_ERR(index)) {
			/* Clear so the cleanup path doesn't dput an ERR_PTR */
			index = NULL;
			goto out_err;
		}
	}

	/* Then try to get a connected upper dir by index */
	if (index && d_is_dir(index)) {
		struct dentry *upper = ovl_index_upper(ofs, index);

		err = PTR_ERR(upper);
		if (IS_ERR_OR_NULL(upper))
			goto out_err;

		dentry = ovl_get_dentry(sb, upper, NULL, NULL);
		dput(upper);
		goto out;
	}

	/* Otherwise, get a connected non-upper dir or disconnected non-dir */
	if (d_is_dir(origin.dentry) &&
	    (origin.dentry->d_flags & DCACHE_DISCONNECTED)) {
		/* Re-decode the origin, this time asking for a connected dir */
		dput(origin.dentry);
		origin.dentry = NULL;
		err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack);
		if (err)
			goto out_err;
	}
	if (index) {
		err = ovl_verify_origin(index, origin.dentry, false);
		if (err)
			goto out_err;
	}

	dentry = ovl_get_dentry(sb, NULL, &origin, index);

out:
	dput(origin.dentry);
	dput(index);
	return dentry;

out_err:
	dentry = ERR_PTR(err);
	goto out;
}
|
|
||||||
|
|
||||||
/*
 * exportfs ->fh_to_dentry() entry point for overlayfs.
 *
 * Validates the handle and dispatches to the upper or lower decoder based
 * on the OVL_FH_FLAG_PATH_UPPER flag stored in the handle.
 */
static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid,
				       int fh_len, int fh_type)
{
	struct dentry *dentry = NULL;
	struct ovl_fh *fh = (struct ovl_fh *) fid;
	/* fh_len is in dwords (exportfs convention); convert to bytes */
	int len = fh_len << 2;
	unsigned int flags = 0;
	int err;

	err = -EINVAL;
	if (fh_type != OVL_FILEID)
		goto out_err;

	err = ovl_check_fh_len(fh, len);
	if (err)
		goto out_err;

	flags = fh->flags;
	dentry = (flags & OVL_FH_FLAG_PATH_UPPER) ?
		 ovl_upper_fh_to_d(sb, fh) :
		 ovl_lower_fh_to_d(sb, fh);
	err = PTR_ERR(dentry);
	/* -ESTALE is a normal "handle no longer valid" result, not warned */
	if (IS_ERR(dentry) && err != -ESTALE)
		goto out_err;

	return dentry;

out_err:
	pr_warn_ratelimited("overlayfs: failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n",
			    len, fh_type, flags, err);
	return ERR_PTR(err);
}
|
|
||||||
|
|
||||||
/*
 * exportfs ->fh_to_parent(): connectable handles are never encoded (see
 * ovl_encode_fh()), so decoding a parent handle is always refused.
 */
static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid,
				       int fh_len, int fh_type)
{
	pr_warn_ratelimited("overlayfs: connectable file handles not supported; use 'no_subtree_check' exportfs option.\n");
	return ERR_PTR(-EACCES);
}
|
|
||||||
|
|
||||||
static int ovl_get_name(struct dentry *parent, char *name,
			struct dentry *child)
{
	/*
	 * ovl_fh_to_dentry() returns connected dir overlay dentries and
	 * ovl_fh_to_parent() is not implemented, so we should not get here.
	 */
	WARN_ON_ONCE(1);
	return -EIO;
}
|
|
||||||
|
|
||||||
static struct dentry *ovl_get_parent(struct dentry *dentry)
{
	/*
	 * ovl_fh_to_dentry() returns connected dir overlay dentries, so we
	 * should not get here.
	 */
	WARN_ON_ONCE(1);
	return ERR_PTR(-EIO);
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*
 * NFS export operations. Decode support (everything besides ->encode_fh)
 * requires d_alloc_anon()/d_instantiate_anon(), available since v4.16.
 */
const struct export_operations ovl_export_operations = {
	.encode_fh	= ovl_encode_fh,
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0)
	.fh_to_dentry	= ovl_fh_to_dentry,
	.fh_to_parent	= ovl_fh_to_parent,
	.get_name	= ovl_get_name,
	.get_parent	= ovl_get_parent,
#endif
};
|
|
||||||
@@ -1,874 +0,0 @@
|
|||||||
/*
|
|
||||||
*
|
|
||||||
* Copyright (C) 2011 Novell Inc.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 as published by
|
|
||||||
* the Free Software Foundation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <linux/fs.h>
|
|
||||||
#include <linux/slab.h>
|
|
||||||
#include <linux/cred.h>
|
|
||||||
#include <linux/xattr.h>
|
|
||||||
#include <linux/posix_acl.h>
|
|
||||||
#include <linux/ratelimit.h>
|
|
||||||
#include <linux/version.h>
|
|
||||||
#include "overlayfs.h"
|
|
||||||
|
|
||||||
|
|
||||||
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
	int err;
	struct dentry *upperdentry;
	const struct cred *old_cred;

	/*
	 * NOTE(review): this early return short-circuits the whole function -
	 * attribute changes are silently reported as success and no copy-up
	 * or notify_change() ever happens. The "NOCOPYUPW" tag suggests this
	 * is a deliberate hack in this tree to suppress copy-up on setattr;
	 * confirm intent - everything below is dead code while it stands.
	 */
	/* NOCOPYUPW */
	return 0;

	/*
	 * Check for permissions before trying to copy-up. This is redundant
	 * since it will be rechecked later by ->setattr() on upper dentry. But
	 * without this, copy-up can be triggered by just about anybody.
	 *
	 * We don't initialize inode->size, which just means that
	 * inode_newsize_ok() will always check against MAX_LFS_FILESIZE and not
	 * check for a swapfile (which this won't be anyway).
	 */
	err = setattr_prepare(dentry, attr);
	if (err)
		return err;

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	err = ovl_copy_up(dentry);
	if (!err) {
		upperdentry = ovl_dentry_upper(dentry);

		/* notify_change() on upper recomputes SUID/SGID kill itself */
		if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
			attr->ia_valid &= ~ATTR_MODE;

		inode_lock(upperdentry->d_inode);
		old_cred = ovl_override_creds(dentry->d_sb);
		err = notify_change(upperdentry, attr, NULL);
		revert_creds(old_cred);
		if (!err)
			ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
		inode_unlock(upperdentry->d_inode);
	}
	ovl_drop_write(dentry);
out:
	return err;
}
|
|
||||||
|
|
||||||
/*
 * Map the real st_dev/st_ino reported by the underlying fs into the overlay
 * st_dev/st_ino address space, depending on the samefs/xino configuration.
 *
 * @lower_layer is the lower layer the inode's st_ino was taken from, or
 * NULL when the upper (real) st_ino is used. Always returns 0.
 */
static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat,
			   struct ovl_layer *lower_layer)
{
	bool samefs = ovl_same_sb(dentry->d_sb);
	unsigned int xinobits = ovl_xino_bits(dentry->d_sb);

	if (samefs) {
		/*
		 * When all layers are on the same fs, all real inode
		 * number are unique, so we use the overlay st_dev,
		 * which is friendly to du -x.
		 */
		stat->dev = dentry->d_sb->s_dev;
		return 0;
	} else if (xinobits) {
		unsigned int shift = 64 - xinobits;
		/*
		 * All inode numbers of underlying fs should not be using the
		 * high xinobits, so we use high xinobits to partition the
		 * overlay st_ino address space. The high bits holds the fsid
		 * (upper fsid is 0). This way overlay inode numbers are unique
		 * and all inodes use overlay st_dev. Inode numbers are also
		 * persistent for a given layer configuration.
		 */
		if (stat->ino >> shift) {
			/* Real ino uses the reserved bits - fall through */
			pr_warn_ratelimited("overlayfs: inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
					    dentry, stat->ino, xinobits);
		} else {
			if (lower_layer)
				stat->ino |= ((u64)lower_layer->fsid) << shift;

			stat->dev = dentry->d_sb->s_dev;
			return 0;
		}
	}

	/* The inode could not be mapped to a unified st_ino address space */
	if (S_ISDIR(dentry->d_inode->i_mode)) {
		/*
		 * Always use the overlay st_dev for directories, so 'find
		 * -xdev' will scan the entire overlay mount and won't cross the
		 * overlay mount boundaries.
		 *
		 * If not all layers are on the same fs the pair {real st_ino;
		 * overlay st_dev} is not unique, so use the non persistent
		 * overlay st_ino for directories.
		 */
		stat->dev = dentry->d_sb->s_dev;
		stat->ino = dentry->d_inode->i_ino;
	} else if (lower_layer && lower_layer->fsid) {
		/*
		 * For non-samefs setup, if we cannot map all layers st_ino
		 * to a unified address space, we need to make sure that st_dev
		 * is unique per lower fs. Upper layer uses real st_dev and
		 * lower layers use the unique anonymous bdev assigned to the
		 * lower fs.
		 */
		stat->dev = lower_layer->fs->pseudo_dev;
	}

	return 0;
}
|
|
||||||
|
|
||||||
/*
 * ->getattr() for overlay inodes.
 *
 * Fetch attributes from the topmost real inode (upper if copied up, lower
 * otherwise) with the mounter's credentials, then adjust st_dev/st_ino/
 * st_nlink so values stay stable across copy up where possible.
 *
 * Returns 0 on success or a negative errno from the underlying vfs_getattr().
 */
int ovl_getattr(const struct path *path, struct kstat *stat,
		u32 request_mask, unsigned int flags)
{
	struct dentry *dentry = path->dentry;
	enum ovl_path_type type;
	struct path realpath;
	const struct cred *old_cred;
	bool is_dir = S_ISDIR(dentry->d_inode->i_mode);
	bool samefs = ovl_same_sb(dentry->d_sb);
	struct ovl_layer *lower_layer = NULL;
	int err;

	type = ovl_path_real(dentry, &realpath);
	old_cred = ovl_override_creds(dentry->d_sb);
	err = vfs_getattr(&realpath, stat, request_mask, flags);
	if (err)
		goto out;

	/*
	 * For non-dir or same fs, we use st_ino of the copy up origin.
	 * This guarantees constant st_dev/st_ino across copy up.
	 * With xino feature and non-samefs, we use st_ino of the copy up
	 * origin masked with high bits that represent the layer id.
	 *
	 * If lower filesystem supports NFS file handles, this also guarantees
	 * persistent st_ino across mount cycle.
	 */
	if (!is_dir || samefs || ovl_xino_bits(dentry->d_sb)) {
		if (!OVL_TYPE_UPPER(type)) {
			lower_layer = ovl_layer_lower(dentry);
		} else if (OVL_TYPE_ORIGIN(type)) {
			struct kstat lowerstat;
			/* nlink only matters for the hardlink check below */
			u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0);

			ovl_path_lower(dentry, &realpath);
			err = vfs_getattr(&realpath, &lowerstat,
					  lowermask, flags);
			if (err)
				goto out;

			/*
			 * Lower hardlinks may be broken on copy up to different
			 * upper files, so we cannot use the lower origin st_ino
			 * for those different files, even for the same fs case.
			 *
			 * Similarly, several redirected dirs can point to the
			 * same dir on a lower layer. With the "verify_lower"
			 * feature, we do not use the lower origin st_ino, if
			 * we haven't verified that this redirect is unique.
			 *
			 * With inodes index enabled, it is safe to use st_ino
			 * of an indexed origin. The index validates that the
			 * upper hardlink is not broken and that a redirected
			 * dir is the only redirect to that origin.
			 */
			if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) ||
			    (!ovl_verify_lower(dentry->d_sb) &&
			     (is_dir || lowerstat.nlink == 1))) {
				stat->ino = lowerstat.ino;
				lower_layer = ovl_layer_lower(dentry);
			}
		}
	}

	/* Map st_dev (and possibly st_ino high bits) per the chosen layer */
	err = ovl_map_dev_ino(dentry, stat, lower_layer);
	if (err)
		goto out;

	/*
	 * It's probably not worth it to count subdirs to get the
	 * correct link count. nlink=1 seems to pacify 'find' and
	 * other utilities.
	 */
	if (is_dir && OVL_TYPE_MERGE(type))
		stat->nlink = 1;

	/*
	 * Return the overlay inode nlinks for indexed upper inodes.
	 * Overlay inode nlink counts the union of the upper hardlinks
	 * and non-covered lower hardlinks. It does not include the upper
	 * index hardlink.
	 */
	if (!is_dir && ovl_test_flag(OVL_INDEX, d_inode(dentry)))
		stat->nlink = dentry->d_inode->i_nlink;

out:
	revert_creds(old_cred);

	return err;
}
|
|
||||||
|
|
||||||
int ovl_permission(struct inode *inode, int mask)
|
|
||||||
{
|
|
||||||
struct inode *upperinode = ovl_inode_upper(inode);
|
|
||||||
struct inode *realinode = upperinode ?: ovl_inode_lower(inode);
|
|
||||||
const struct cred *old_cred;
|
|
||||||
int err;
|
|
||||||
|
|
||||||
/* Careful in RCU walk mode */
|
|
||||||
if (!realinode) {
|
|
||||||
WARN_ON(!(mask & MAY_NOT_BLOCK));
|
|
||||||
return -ECHILD;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Check overlay inode with the creds of task and underlying inode
|
|
||||||
* with creds of mounter
|
|
||||||
*/
|
|
||||||
err = generic_permission(inode, mask);
|
|
||||||
if (err)
|
|
||||||
return err;
|
|
||||||
|
|
||||||
old_cred = ovl_override_creds(inode->i_sb);
|
|
||||||
if (!upperinode &&
|
|
||||||
!special_file(realinode->i_mode) && mask & MAY_WRITE) {
|
|
||||||
mask &= ~(MAY_WRITE | MAY_APPEND);
|
|
||||||
/* Make sure mounter can read file for copy up later */
|
|
||||||
mask |= MAY_READ;
|
|
||||||
}
|
|
||||||
err = inode_permission(realinode, mask);
|
|
||||||
revert_creds(old_cred);
|
|
||||||
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * ->readlink() for overlay symlinks: forward to the real (topmost)
 * dentry's readlink, touching atime on the real path first.
 */
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
	struct path real;
	struct inode *target;

	ovl_path_real(dentry, &real);
	target = real.dentry->d_inode;

	if (!target->i_op->readlink)
		return -EINVAL;

	touch_atime(&real);

	return target->i_op->readlink(real.dentry, buf, bufsiz);
}
|
|
||||||
|
|
||||||
static const char *ovl_get_link(struct dentry *dentry,
|
|
||||||
struct inode *inode,
|
|
||||||
struct delayed_call *done)
|
|
||||||
{
|
|
||||||
const struct cred *old_cred;
|
|
||||||
const char *p;
|
|
||||||
|
|
||||||
if (!dentry)
|
|
||||||
return ERR_PTR(-ECHILD);
|
|
||||||
|
|
||||||
old_cred = ovl_override_creds(dentry->d_sb);
|
|
||||||
p = vfs_get_link(ovl_dentry_real(dentry), done);
|
|
||||||
revert_creds(old_cred);
|
|
||||||
return p;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ovl_is_private_xattr(const char *name)
|
|
||||||
{
|
|
||||||
return strncmp(name, OVL_XATTR_PREFIX,
|
|
||||||
sizeof(OVL_XATTR_PREFIX) - 1) == 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Set (or, with a NULL @value, remove) an xattr on an overlay dentry.
 *
 * The operation is applied to the upper dentry, copying the file up first
 * if necessary; the underlying vfs call runs with the mounter's creds.
 * Returns 0 on success or a negative errno.
 *
 * NOTE(review): this function previously began with an unconditional
 * "/* NOCOPYUPW *" "/ return 0;" stub, which reported success while
 * silently discarding every setxattr/removexattr request and left the
 * entire body below unreachable. Returning success without performing
 * the operation is a correctness bug, so the stub has been removed; if
 * suppressing copy-up writes is really desired it must be an explicit,
 * documented mount option.
 */
int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
		  const void *value, size_t size, int flags)
{
	int err;
	struct dentry *upperdentry = ovl_i_dentry_upper(inode);
	struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
	const struct cred *old_cred;

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	/*
	 * Removing an xattr from a lower-only file: check that it exists
	 * before triggering a copy up that would only fail afterwards.
	 */
	if (!value && !upperdentry) {
		err = vfs_getxattr(realdentry, name, NULL, 0);
		if (err < 0)
			goto out_drop_write;
	}

	if (!upperdentry) {
		err = ovl_copy_up(dentry);
		if (err)
			goto out_drop_write;

		realdentry = ovl_dentry_upper(dentry);
	}

	old_cred = ovl_override_creds(dentry->d_sb);
	if (value)
		err = vfs_setxattr(realdentry, name, value, size, flags);
	else {
		/* NULL value means removal; only XATTR_REPLACE makes sense */
		WARN_ON(flags != XATTR_REPLACE);
		err = vfs_removexattr(realdentry, name);
	}
	revert_creds(old_cred);

out_drop_write:
	ovl_drop_write(dentry);
out:
	return err;
}
|
|
||||||
|
|
||||||
/*
 * Read an xattr from the topmost real dentry (upper if present, else
 * lower) with the mounter's credentials.
 */
int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
		  void *value, size_t size)
{
	const struct cred *saved_cred;
	struct dentry *real;
	ssize_t ret;

	real = ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry);

	saved_cred = ovl_override_creds(dentry->d_sb);
	ret = vfs_getxattr(real, name, value, size);
	revert_creds(saved_cred);

	return ret;
}
|
|
||||||
|
|
||||||
static bool ovl_can_list(const char *s)
|
|
||||||
{
|
|
||||||
/* List all non-trusted xatts */
|
|
||||||
if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0)
|
|
||||||
return true;
|
|
||||||
|
|
||||||
/* Never list trusted.overlay, list other trusted for superuser only */
|
|
||||||
return !ovl_is_private_xattr(s) && capable(CAP_SYS_ADMIN);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * ->listxattr() for overlay inodes.
 *
 * List the real dentry's xattrs with the mounter's credentials, then
 * compact the name buffer in place, removing entries the caller may not
 * see (see ovl_can_list()).  Returns the filtered byte count, or the
 * underlying vfs_listxattr() result on error / size-query.
 */
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
	struct dentry *realdentry = ovl_dentry_real(dentry);
	ssize_t res;
	size_t len;
	char *s;
	const struct cred *old_cred;

	old_cred = ovl_override_creds(dentry->d_sb);
	res = vfs_listxattr(realdentry, list, size);
	revert_creds(old_cred);
	/* error, empty list, or caller only probing for required size */
	if (res <= 0 || size == 0)
		return res;

	/* filter out private xattrs */
	for (s = list, len = res; len;) {
		/* each entry is a NUL-terminated name */
		size_t slen = strnlen(s, len) + 1;

		/* underlying fs providing us with a broken xattr list? */
		if (WARN_ON(slen > len))
			return -EIO;

		len -= slen;
		if (!ovl_can_list(s)) {
			/* drop this name: shift the remaining tail down */
			res -= slen;
			memmove(s, s + slen, len);
		} else {
			s += slen;
		}
	}

	return res;
}
|
|
||||||
|
|
||||||
struct posix_acl *ovl_get_acl(struct inode *inode, int type)
|
|
||||||
{
|
|
||||||
struct inode *realinode = ovl_inode_real(inode);
|
|
||||||
const struct cred *old_cred;
|
|
||||||
struct posix_acl *acl;
|
|
||||||
|
|
||||||
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode))
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
old_cred = ovl_override_creds(inode->i_sb);
|
|
||||||
acl = get_acl(realinode, type);
|
|
||||||
revert_creds(old_cred);
|
|
||||||
|
|
||||||
return acl;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool ovl_open_need_copy_up(struct dentry *dentry, int flags)
|
|
||||||
{
|
|
||||||
/* Copy up of disconnected dentry does not set upper alias */
|
|
||||||
if (ovl_dentry_upper(dentry) &&
|
|
||||||
(ovl_dentry_has_upper_alias(dentry) ||
|
|
||||||
(dentry->d_flags & DCACHE_DISCONNECTED)))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
if (special_file(d_inode(dentry)->i_mode))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Copy the file up before open if the open flags require it.
 * Returns 0 on success (or when no copy up is needed), negative errno
 * on failure.
 *
 * NOTE(review): an unconditional "/* NOCOPYUPW *" "/ return err;" stub
 * used to sit at the top of this function, making everything below dead
 * code so write-opens never triggered copy up.  If "no copy-up on write"
 * is really an intended mode for this fork it should be gated by an
 * explicit mount option rather than a hard-coded early return; the stub
 * has been removed to restore the documented behavior.
 */
int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags)
{
	int err = 0;

	if (ovl_open_need_copy_up(dentry, file_flags)) {
		err = ovl_want_write(dentry);
		if (!err) {
			err = ovl_copy_up_flags(dentry, file_flags);
			ovl_drop_write(dentry);
		}
	}

	return err;
}
|
|
||||||
|
|
||||||
int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags)
|
|
||||||
{
|
|
||||||
if (flags & S_ATIME) {
|
|
||||||
struct ovl_fs *ofs = inode->i_sb->s_fs_info;
|
|
||||||
struct path upperpath = {
|
|
||||||
.mnt = ofs->upper_mnt,
|
|
||||||
.dentry = ovl_upperdentry_dereference(OVL_I(inode)),
|
|
||||||
};
|
|
||||||
|
|
||||||
if (upperpath.dentry) {
|
|
||||||
touch_atime(&upperpath);
|
|
||||||
inode->i_atime = d_inode(upperpath.dentry)->i_atime;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Inode operations for overlay regular files and special files */
static const struct inode_operations ovl_file_inode_operations = {
	.setattr	= ovl_setattr,
	.permission	= ovl_permission,
	.getattr	= ovl_getattr,
	.listxattr	= ovl_listxattr,
	.get_acl	= ovl_get_acl,
	.update_time	= ovl_update_time,
};
|
|
||||||
|
|
||||||
/* Inode operations for overlay symlinks (no permission/ACL hooks needed) */
static const struct inode_operations ovl_symlink_inode_operations = {
	.setattr	= ovl_setattr,
	.get_link	= ovl_get_link,
	.readlink	= ovl_readlink,
	.getattr	= ovl_getattr,
	.listxattr	= ovl_listxattr,
	.update_time	= ovl_update_time,
};
|
|
||||||
|
|
||||||
/*
 * It is possible to stack overlayfs instance on top of another
 * overlayfs instance as lower layer. We need to annotate the
 * stackable i_mutex locks according to stack level of the super
 * block instance. An overlayfs instance can never be in stack
 * depth 0 (there is always a real fs below it). An overlayfs
 * inode lock will use the lockdep annotation ovl_i_mutex_key[depth].
 *
 * For example, here is a snip from /proc/lockdep_chains after
 * dir_iterate of nested overlayfs:
 *
 * [...] &ovl_i_mutex_dir_key[depth]   (stack_depth=2)
 * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1)
 * [...] &type->i_mutex_dir_key        (stack_depth=0)
 */
#define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH

/*
 * Give each nesting depth its own lockdep class for the inode rwsem and
 * the ovl_inode lock, so lockdep does not report false self-deadlocks
 * when one overlay is stacked on another.  No-op without CONFIG_LOCKDEP.
 */
static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
{
#ifdef CONFIG_LOCKDEP
	static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING];
	static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING];
	static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING];

	/* depth 0 is the real fs below; overlay depth starts at 1 */
	int depth = inode->i_sb->s_stack_depth - 1;

	if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING))
		depth = 0;

	if (S_ISDIR(inode->i_mode))
		lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]);
	else
		lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]);

	lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]);
#endif
}
|
|
||||||
|
|
||||||
/*
 * Initialize the common fields of an overlay inode: i_ino policy, mode,
 * flags, lockdep class and the i_op/i_fop tables according to file type.
 * @ino and @fsid come from the real inode used as hash key; both are 0
 * when called via ovl_new_inode() before the real inode is known.
 */
static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev,
			   unsigned long ino, int fsid)
{
	int xinobits = ovl_xino_bits(inode->i_sb);

	/*
	 * When NFS export is enabled and d_ino is consistent with st_ino
	 * (samefs or i_ino has enough bits to encode layer), set the same
	 * value used for d_ino to i_ino, because nfsd readdirplus compares
	 * d_ino values to i_ino values of child entries. When called from
	 * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real
	 * upper inode i_ino on ovl_inode_init() or ovl_inode_update().
	 */
	if (inode->i_sb->s_export_op &&
	    (ovl_same_sb(inode->i_sb) || xinobits)) {
		inode->i_ino = ino;
		/* encode the layer id in the high xino bits, if they are free */
		if (xinobits && fsid && !(ino >> (64 - xinobits)))
			inode->i_ino |= (unsigned long)fsid << (64 - xinobits);
	} else {
		inode->i_ino = get_next_ino();
	}
	inode->i_mode = mode;
	/* ctime/mtime are managed by the real filesystems */
	inode->i_flags |= S_NOCMTIME;
#ifdef CONFIG_FS_POSIX_ACL
	/* ACLs are always fetched from the real inode via ovl_get_acl() */
	inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE;
#endif

	ovl_lockdep_annotate_inode_mutex_key(inode);

	switch (mode & S_IFMT) {
	case S_IFREG:
		inode->i_op = &ovl_file_inode_operations;
		break;

	case S_IFDIR:
		inode->i_op = &ovl_dir_inode_operations;
		inode->i_fop = &ovl_dir_operations;
		break;

	case S_IFLNK:
		inode->i_op = &ovl_symlink_inode_operations;
		break;

	default:
		/* device nodes, fifos, sockets */
		inode->i_op = &ovl_file_inode_operations;
		init_special_inode(inode, mode, rdev);
		break;
	}
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* With inodes index enabled, an overlay inode nlink counts the union of upper
|
|
||||||
* hardlinks and non-covered lower hardlinks. During the lifetime of a non-pure
|
|
||||||
* upper inode, the following nlink modifying operations can happen:
|
|
||||||
*
|
|
||||||
* 1. Lower hardlink copy up
|
|
||||||
* 2. Upper hardlink created, unlinked or renamed over
|
|
||||||
* 3. Lower hardlink whiteout or renamed over
|
|
||||||
*
|
|
||||||
* For the first, copy up case, the union nlink does not change, whether the
|
|
||||||
* operation succeeds or fails, but the upper inode nlink may change.
|
|
||||||
* Therefore, before copy up, we store the union nlink value relative to the
|
|
||||||
* lower inode nlink in the index inode xattr trusted.overlay.nlink.
|
|
||||||
*
|
|
||||||
* For the second, upper hardlink case, the union nlink should be incremented
|
|
||||||
* or decremented IFF the operation succeeds, aligned with nlink change of the
|
|
||||||
* upper inode. Therefore, before link/unlink/rename, we store the union nlink
|
|
||||||
* value relative to the upper inode nlink in the index inode.
|
|
||||||
*
|
|
||||||
* For the last, lower cover up case, we simplify things by preceding the
|
|
||||||
* whiteout or cover up with copy up. This makes sure that there is an index
|
|
||||||
* upper inode where the nlink xattr can be stored before the copied up upper
|
|
||||||
 * entry is unlinked.
|
|
||||||
*/
|
|
||||||
#define OVL_NLINK_ADD_UPPER (1 << 0)
|
|
||||||
|
|
||||||
/*
|
|
||||||
* On-disk format for indexed nlink:
|
|
||||||
*
|
|
||||||
* nlink relative to the upper inode - "U[+-]NUM"
|
|
||||||
* nlink relative to the lower inode - "L[+-]NUM"
|
|
||||||
*/
|
|
||||||
|
|
||||||
static int ovl_set_nlink_common(struct dentry *dentry,
|
|
||||||
struct dentry *realdentry, const char *format)
|
|
||||||
{
|
|
||||||
struct inode *inode = d_inode(dentry);
|
|
||||||
struct inode *realinode = d_inode(realdentry);
|
|
||||||
char buf[13];
|
|
||||||
int len;
|
|
||||||
|
|
||||||
len = snprintf(buf, sizeof(buf), format,
|
|
||||||
(int) (inode->i_nlink - realinode->i_nlink));
|
|
||||||
|
|
||||||
if (WARN_ON(len >= sizeof(buf)))
|
|
||||||
return -EIO;
|
|
||||||
|
|
||||||
return ovl_do_setxattr(ovl_dentry_upper(dentry),
|
|
||||||
OVL_XATTR_NLINK, buf, len, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Store union nlink relative to the upper inode ("U[+-]NUM") */
int ovl_set_nlink_upper(struct dentry *dentry)
{
	return ovl_set_nlink_common(dentry, ovl_dentry_upper(dentry), "U%+i");
}
|
|
||||||
|
|
||||||
/* Store union nlink relative to the lower inode ("L[+-]NUM") */
int ovl_set_nlink_lower(struct dentry *dentry)
{
	return ovl_set_nlink_common(dentry, ovl_dentry_lower(dentry), "L%+i");
}
|
|
||||||
|
|
||||||
/*
 * Compute the union nlink for a copied up hardlink from the
 * trusted.overlay.nlink xattr on the upper inode.
 *
 * The xattr encodes the union nlink as an offset relative to either the
 * lower ('L') or upper ('U') inode's current nlink (see the on-disk
 * format comment above).  On any parse or lookup failure, or when the
 * file is not an indexed hardlink, return @fallback.
 */
unsigned int ovl_get_nlink(struct dentry *lowerdentry,
			   struct dentry *upperdentry,
			   unsigned int fallback)
{
	int nlink_diff;
	int nlink;
	char buf[13];
	int err;

	/* Not a copied up lower hardlink - nothing to recompute */
	if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1)
		return fallback;

	err = vfs_getxattr(upperdentry, OVL_XATTR_NLINK, &buf, sizeof(buf) - 1);
	if (err < 0)
		goto fail;

	buf[err] = '\0';
	/* format must be "[UL][+-]NUM" */
	if ((buf[0] != 'L' && buf[0] != 'U') ||
	    (buf[1] != '+' && buf[1] != '-'))
		goto fail;

	/* buf+1 is the signed offset, e.g. "+2" or "-1" */
	err = kstrtoint(buf + 1, 10, &nlink_diff);
	if (err < 0)
		goto fail;

	nlink = d_inode(buf[0] == 'L' ? lowerdentry : upperdentry)->i_nlink;
	nlink += nlink_diff;

	if (nlink <= 0)
		goto fail;

	return nlink;

fail:
	pr_warn_ratelimited("overlayfs: failed to get index nlink (%pd2, err=%i)\n",
			    upperdentry, err);
	return fallback;
}
|
|
||||||
|
|
||||||
/*
 * Allocate and initialize a fresh overlay inode (used before the real
 * inode is known; ino/fsid are filled in later).  Returns NULL on OOM.
 */
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev)
{
	struct inode *inode = new_inode(sb);

	if (!inode)
		return NULL;

	ovl_fill_inode(inode, mode, rdev, 0, 0);
	return inode;
}
|
|
||||||
|
|
||||||
static int ovl_inode_test(struct inode *inode, void *data)
|
|
||||||
{
|
|
||||||
return inode->i_private == data;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int ovl_inode_set(struct inode *inode, void *data)
|
|
||||||
{
|
|
||||||
inode->i_private = data;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Verify that a cached overlay inode still matches the real dentries the
 * caller resolved.  Returns false when the inode is stale (e.g. a layer
 * changed underneath us) and must not be reused.
 */
static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry,
			     struct dentry *upperdentry, bool strict)
{
	/*
	 * For directories, @strict verify from lookup path performs consistency
	 * checks, so NULL lower/upper in dentry must match NULL lower/upper in
	 * inode. Non @strict verify from NFS handle decode path passes NULL for
	 * 'unknown' lower/upper.
	 */
	if (S_ISDIR(inode->i_mode) && strict) {
		/* Real lower dir moved to upper layer under us? */
		if (!lowerdentry && ovl_inode_lower(inode))
			return false;

		/* Lookup of an uncovered redirect origin? */
		if (!upperdentry && ovl_inode_upper(inode))
			return false;
	}

	/*
	 * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL.
	 * This happens when finding a copied up overlay inode for a renamed
	 * or hardlinked overlay dentry and lower dentry cannot be followed
	 * by origin because lower fs does not support file handles.
	 */
	if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry))
		return false;

	/*
	 * Allow non-NULL __upperdentry in inode even if upperdentry is NULL.
	 * This happens when finding a lower alias for a copied up hard link.
	 */
	if (upperdentry && ovl_inode_upper(inode) != d_inode(upperdentry))
		return false;

	return true;
}
|
|
||||||
|
|
||||||
struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
|
|
||||||
bool is_upper)
|
|
||||||
{
|
|
||||||
struct inode *inode, *key = d_inode(real);
|
|
||||||
|
|
||||||
inode = ilookup5(sb, (unsigned long) key, ovl_inode_test, key);
|
|
||||||
if (!inode)
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
if (!ovl_verify_inode(inode, is_upper ? NULL : real,
|
|
||||||
is_upper ? real : NULL, false)) {
|
|
||||||
iput(inode);
|
|
||||||
return ERR_PTR(-ESTALE);
|
|
||||||
}
|
|
||||||
|
|
||||||
return inode;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Does overlay inode need to be hashed by lower inode?
 */
static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
			     struct dentry *lower, struct dentry *index)
{
	struct ovl_fs *ofs = sb->s_fs_info;

	/* No, if pure upper */
	if (!lower)
		return false;

	/* Yes, if already indexed */
	if (index)
		return true;

	/* Yes, if won't be copied up */
	if (!ofs->upper_mnt)
		return true;

	/* No, if lower hardlink is or will be broken on copy up */
	if ((upper || !ovl_indexdir(sb)) &&
	    !d_is_dir(lower) && d_inode(lower)->i_nlink > 1)
		return false;

	/* No, if non-indexed upper with NFS export */
	if (sb->s_export_op && upper)
		return false;

	/* Otherwise, hash by lower inode for fsnotify */
	return true;
}
|
|
||||||
|
|
||||||
/*
 * Get (or insert) the overlay inode hashed by real inode @key.
 *
 * On kernels >= 4.18, a preallocated @newinode can be inserted with
 * inode_insert5(); otherwise (or when @newinode is NULL) fall back to
 * iget5_locked().  Note the deliberate split expression: on old kernels
 * the ternary's false branch below becomes the whole return value.
 */
static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode,
			       struct inode *key)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0)
	return newinode ? inode_insert5(newinode, (unsigned long) key,
					ovl_inode_test, ovl_inode_set, key) :
#else
	return
#endif
		iget5_locked(sb, (unsigned long) key,
			     ovl_inode_test, ovl_inode_set, key);
}
|
|
||||||
|
|
||||||
/*
 * Obtain the overlay inode for the real dentries described by @oip.
 *
 * Hashed inodes (upper, or lower when ovl_hash_bylower() says so) are
 * looked up / inserted in the inode cache keyed by the real inode; a
 * cache hit is verified against the dentries and, if valid, consumes the
 * upperdentry reference.  Unhashed lower hardlinks get a fresh inode.
 *
 * Returns the inode, ERR_PTR(-ESTALE) on stale cache hit, or
 * ERR_PTR(-ENOMEM).
 */
struct inode *ovl_get_inode(struct super_block *sb,
			    struct ovl_inode_params *oip)
{
	struct dentry *upperdentry = oip->upperdentry;
	struct ovl_path *lowerpath = oip->lowerpath;
	struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
	struct inode *inode;
	struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL;
	bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
					oip->index);
	int fsid = bylower ? oip->lowerpath->layer->fsid : 0;
	bool is_dir;
	unsigned long ino = 0;

	if (!realinode)
		realinode = d_inode(lowerdentry);

	/*
	 * Copy up origin (lower) may exist for non-indexed upper, but we must
	 * not use lower as hash key if this is a broken hardlink.
	 */
	is_dir = S_ISDIR(realinode->i_mode);
	if (upperdentry || bylower) {
		struct inode *key = d_inode(bylower ? lowerdentry :
					    upperdentry);
		unsigned int nlink = is_dir ? 1 : realinode->i_nlink;

		inode = ovl_iget5(sb, oip->newinode, key);
		if (!inode)
			goto out_nomem;
		if (!(inode->i_state & I_NEW)) {
			/*
			 * Verify that the underlying files stored in the inode
			 * match those in the dentry.
			 */
			if (!ovl_verify_inode(inode, lowerdentry, upperdentry,
					      true)) {
				iput(inode);
				inode = ERR_PTR(-ESTALE);
				goto out;
			}

			/* cache hit owns its own upper reference */
			dput(upperdentry);
			goto out;
		}

		/* Recalculate nlink for non-dir due to indexing */
		if (!is_dir)
			nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink);
		set_nlink(inode, nlink);
		ino = key->i_ino;
	} else {
		/* Lower hardlink that will be broken on copy up */
		inode = new_inode(sb);
		if (!inode)
			goto out_nomem;
	}
	ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid);
	ovl_inode_init(inode, upperdentry, lowerdentry);

	if (upperdentry && ovl_is_impuredir(upperdentry))
		ovl_set_flag(OVL_IMPURE, inode);

	if (oip->index)
		ovl_set_flag(OVL_INDEX, inode);

	/* Check for non-merge dir that may have whiteouts */
	if (is_dir) {
		if (((upperdentry && lowerdentry) || oip->numlower > 1) ||
		    ovl_check_origin_xattr(upperdentry ?: lowerdentry)) {
			ovl_set_flag(OVL_WHITEOUTS, inode);
		}
	}

	if (inode->i_state & I_NEW)
		unlock_new_inode(inode);
out:
	return inode;

out_nomem:
	inode = ERR_PTR(-ENOMEM);
	goto out;
}
|
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -1,381 +0,0 @@
|
|||||||
/*
|
|
||||||
*
|
|
||||||
* Copyright (C) 2011 Novell Inc.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 as published by
|
|
||||||
* the Free Software Foundation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <linux/kernel.h>
|
|
||||||
#include <linux/uuid.h>
|
|
||||||
#include "ovl_entry.h"
|
|
||||||
|
|
||||||
/* Bitmask describing which real layers back an overlay dentry */
enum ovl_path_type {
	__OVL_PATH_UPPER	= (1 << 0),
	__OVL_PATH_MERGE	= (1 << 1),
	__OVL_PATH_ORIGIN	= (1 << 2),
};
|
|
||||||
|
|
||||||
/* Accessors for the ovl_path_type bitmask */
#define OVL_TYPE_UPPER(type)	((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type)	((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_ORIGIN(type)	((type) & __OVL_PATH_ORIGIN)

/* Private overlayfs xattr namespace ("trusted.overlay.*") */
#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay."
#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque"
#define OVL_XATTR_REDIRECT OVL_XATTR_PREFIX "redirect"
#define OVL_XATTR_ORIGIN OVL_XATTR_PREFIX "origin"
#define OVL_XATTR_IMPURE OVL_XATTR_PREFIX "impure"
#define OVL_XATTR_NLINK OVL_XATTR_PREFIX "nlink"
#define OVL_XATTR_UPPER OVL_XATTR_PREFIX "upper"
|
|
||||||
/* Per-inode state flags (tested/set via ovl_test_flag()/ovl_set_flag()) */
enum ovl_inode_flag {
	/* Pure upper dir that may contain non pure upper entries */
	OVL_IMPURE,
	/* Non-merge dir that may contain whiteout entries */
	OVL_WHITEOUTS,
	/* Inode has an entry in the upper index dir */
	OVL_INDEX,
};
|
|
||||||
|
|
||||||
/* Per-dentry (ovl_entry) state flags */
enum ovl_entry_flag {
	/* Upper dentry is a verified alias of the overlay dentry */
	OVL_E_UPPER_ALIAS,
	/* Dir is opaque - lower entries are not merged in */
	OVL_E_OPAQUE,
	/* Dentry has been connected to the dentry tree (NFS export) */
	OVL_E_CONNECTED,
};
|
|
||||||
|
|
||||||
/*
 * The tuple (fh,uuid) is a universally unique identifier for a copy up
 * origin, where:
 * origin.fh	- exported file handle of the lower file
 * origin.uuid	- uuid of the lower filesystem
 */
#define OVL_FH_VERSION	0
#define OVL_FH_MAGIC	0xfb

/* CPU byte order required for fid decoding: */
#define OVL_FH_FLAG_BIG_ENDIAN	(1 << 0)
#define OVL_FH_FLAG_ANY_ENDIAN	(1 << 1)
/* Is the real inode encoded in fid an upper inode? */
#define OVL_FH_FLAG_PATH_UPPER	(1 << 2)

/* Mask of all defined flag bits - anything else is from the future */
#define OVL_FH_FLAG_ALL (OVL_FH_FLAG_BIG_ENDIAN | OVL_FH_FLAG_ANY_ENDIAN | \
			 OVL_FH_FLAG_PATH_UPPER)

#if defined(__LITTLE_ENDIAN)
#define OVL_FH_FLAG_CPU_ENDIAN 0
#elif defined(__BIG_ENDIAN)
#define OVL_FH_FLAG_CPU_ENDIAN OVL_FH_FLAG_BIG_ENDIAN
#else
#error Endianness not defined
#endif

/* The type returned by overlay exportfs ops when encoding an ovl_fh handle */
#define OVL_FILEID	0xfb
|
|
||||||
|
|
||||||
/* On-disk and in-memory format for redirect by file handle */
|
|
||||||
struct ovl_fh {
|
|
||||||
u8 version; /* 0 */
|
|
||||||
u8 magic; /* 0xfb */
|
|
||||||
u8 len; /* size of this header + size of fid */
|
|
||||||
u8 flags; /* OVL_FH_FLAG_* */
|
|
||||||
u8 type; /* fid_type of fid */
|
|
||||||
uuid_t uuid; /* uuid of filesystem */
|
|
||||||
u8 fid[0]; /* file identifier */
|
|
||||||
} __packed;
|
|
||||||
|
|
||||||
/* vfs_rmdir() wrapper with debug tracing */
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
	int err = vfs_rmdir(dir, dentry);

	pr_debug("rmdir(%pd2) = %i\n", dentry, err);
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_unlink() wrapper (no delegated inode) with debug tracing */
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
	int err = vfs_unlink(dir, dentry, NULL);

	pr_debug("unlink(%pd2) = %i\n", dentry, err);
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_link() wrapper (no delegated inode) with debug tracing */
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
			      struct dentry *new_dentry)
{
	int err = vfs_link(old_dentry, dir, new_dentry, NULL);

	pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err);
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_create() wrapper (want_excl = true) with debug tracing */
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
				umode_t mode)
{
	int err = vfs_create(dir, dentry, mode, true);

	pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_mkdir() wrapper with debug tracing */
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
			       umode_t mode)
{
	int err = vfs_mkdir(dir, dentry, mode);
	pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_mknod() wrapper with debug tracing */
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
			       umode_t mode, dev_t dev)
{
	int err = vfs_mknod(dir, dentry, mode, dev);

	pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err);
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_symlink() wrapper with debug tracing */
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
				 const char *oldname)
{
	int err = vfs_symlink(dir, dentry, oldname);

	pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_setxattr() wrapper with debug tracing (value truncated to 48 bytes) */
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
				  const void *value, size_t size, int flags)
{
	int err = vfs_setxattr(dentry, name, value, size, flags);
	pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0x%x) = %i\n",
		 dentry, name, min((int)size, 48), value, size, flags, err);
	return err;
}
|
|
||||||
|
|
||||||
/* vfs_removexattr() wrapper with debug tracing */
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
	int err = vfs_removexattr(dentry, name);
	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
	return err;
}
|
|
||||||
|
|
||||||
/*
 * vfs_rename() wrapper with debug tracing.  Traced before the call (the
 * rename may invalidate the dentries) and again on failure.
 */
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
				struct inode *newdir, struct dentry *newdentry,
				unsigned int flags)
{
	int err;

	pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags);
	err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
	if (err) {
		pr_debug("...rename(%pd2, %pd2, ...) = %i\n",
			 olddentry, newdentry, err);
	}
	return err;
}
|
|
||||||
|
|
||||||
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
|
|
||||||
{
|
|
||||||
int err = vfs_whiteout(dir, dentry);
|
|
||||||
pr_debug("whiteout(%pd2) = %i\n", dentry, err);
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode)
|
|
||||||
{
|
|
||||||
struct dentry *ret = vfs_tmpfile(dentry, mode, 0);
|
|
||||||
int err = PTR_ERR_OR_ZERO(ret);
|
|
||||||
|
|
||||||
pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* util.c */
|
|
||||||
int ovl_want_write(struct dentry *dentry);
|
|
||||||
void ovl_drop_write(struct dentry *dentry);
|
|
||||||
struct dentry *ovl_workdir(struct dentry *dentry);
|
|
||||||
const struct cred *ovl_override_creds(struct super_block *sb);
|
|
||||||
struct super_block *ovl_same_sb(struct super_block *sb);
|
|
||||||
int ovl_can_decode_fh(struct super_block *sb);
|
|
||||||
struct dentry *ovl_indexdir(struct super_block *sb);
|
|
||||||
bool ovl_index_all(struct super_block *sb);
|
|
||||||
bool ovl_verify_lower(struct super_block *sb);
|
|
||||||
struct ovl_entry *ovl_alloc_entry(unsigned int numlower);
|
|
||||||
bool ovl_dentry_remote(struct dentry *dentry);
|
|
||||||
bool ovl_dentry_weird(struct dentry *dentry);
|
|
||||||
enum ovl_path_type ovl_path_type(struct dentry *dentry);
|
|
||||||
void ovl_path_upper(struct dentry *dentry, struct path *path);
|
|
||||||
void ovl_path_lower(struct dentry *dentry, struct path *path);
|
|
||||||
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
|
|
||||||
struct dentry *ovl_dentry_upper(struct dentry *dentry);
|
|
||||||
struct dentry *ovl_dentry_lower(struct dentry *dentry);
|
|
||||||
struct ovl_layer *ovl_layer_lower(struct dentry *dentry);
|
|
||||||
struct dentry *ovl_dentry_real(struct dentry *dentry);
|
|
||||||
struct dentry *ovl_i_dentry_upper(struct inode *inode);
|
|
||||||
struct inode *ovl_inode_upper(struct inode *inode);
|
|
||||||
struct inode *ovl_inode_lower(struct inode *inode);
|
|
||||||
struct inode *ovl_inode_real(struct inode *inode);
|
|
||||||
struct ovl_dir_cache *ovl_dir_cache(struct inode *inode);
|
|
||||||
void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache);
|
|
||||||
void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry);
|
|
||||||
void ovl_dentry_clear_flag(unsigned long flag, struct dentry *dentry);
|
|
||||||
bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry);
|
|
||||||
bool ovl_dentry_is_opaque(struct dentry *dentry);
|
|
||||||
bool ovl_dentry_is_whiteout(struct dentry *dentry);
|
|
||||||
void ovl_dentry_set_opaque(struct dentry *dentry);
|
|
||||||
bool ovl_dentry_has_upper_alias(struct dentry *dentry);
|
|
||||||
void ovl_dentry_set_upper_alias(struct dentry *dentry);
|
|
||||||
bool ovl_redirect_dir(struct super_block *sb);
|
|
||||||
const char *ovl_dentry_get_redirect(struct dentry *dentry);
|
|
||||||
void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect);
|
|
||||||
void ovl_inode_init(struct inode *inode, struct dentry *upperdentry,
|
|
||||||
struct dentry *lowerdentry);
|
|
||||||
void ovl_inode_update(struct inode *inode, struct dentry *upperdentry);
|
|
||||||
void ovl_dentry_version_inc(struct dentry *dentry, bool impurity);
|
|
||||||
u64 ovl_dentry_version_get(struct dentry *dentry);
|
|
||||||
bool ovl_is_whiteout(struct dentry *dentry);
|
|
||||||
struct file *ovl_path_open(struct path *path, int flags);
|
|
||||||
int ovl_copy_up_start(struct dentry *dentry);
|
|
||||||
void ovl_copy_up_end(struct dentry *dentry);
|
|
||||||
bool ovl_check_origin_xattr(struct dentry *dentry);
|
|
||||||
bool ovl_check_dir_xattr(struct dentry *dentry, const char *name);
|
|
||||||
int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry,
|
|
||||||
const char *name, const void *value, size_t size,
|
|
||||||
int xerr);
|
|
||||||
int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry);
|
|
||||||
void ovl_set_flag(unsigned long flag, struct inode *inode);
|
|
||||||
void ovl_clear_flag(unsigned long flag, struct inode *inode);
|
|
||||||
bool ovl_test_flag(unsigned long flag, struct inode *inode);
|
|
||||||
bool ovl_inuse_trylock(struct dentry *dentry);
|
|
||||||
void ovl_inuse_unlock(struct dentry *dentry);
|
|
||||||
bool ovl_need_index(struct dentry *dentry);
|
|
||||||
int ovl_nlink_start(struct dentry *dentry, bool *locked);
|
|
||||||
void ovl_nlink_end(struct dentry *dentry, bool locked);
|
|
||||||
int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
|
|
||||||
|
|
||||||
static inline bool ovl_is_impuredir(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE);
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline unsigned int ovl_xino_bits(struct super_block *sb)
|
|
||||||
{
|
|
||||||
struct ovl_fs *ofs = sb->s_fs_info;
|
|
||||||
|
|
||||||
return ofs->xino_bits;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* namei.c */
|
|
||||||
int ovl_check_fh_len(struct ovl_fh *fh, int fh_len);
|
|
||||||
struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt,
|
|
||||||
bool connected);
|
|
||||||
int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
|
|
||||||
struct dentry *upperdentry, struct ovl_path **stackp);
|
|
||||||
int ovl_verify_set_fh(struct dentry *dentry, const char *name,
|
|
||||||
struct dentry *real, bool is_upper, bool set);
|
|
||||||
struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index);
|
|
||||||
int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index);
|
|
||||||
int ovl_get_index_name(struct dentry *origin, struct qstr *name);
|
|
||||||
struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh);
|
|
||||||
struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
|
|
||||||
struct dentry *origin, bool verify);
|
|
||||||
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
|
|
||||||
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
|
|
||||||
unsigned int flags);
|
|
||||||
bool ovl_lower_positive(struct dentry *dentry);
|
|
||||||
|
|
||||||
/* Verify (or optionally set) the "origin" file handle on an upper dentry */
static inline int ovl_verify_origin(struct dentry *upper,
				    struct dentry *origin, bool set)
{
	return ovl_verify_set_fh(upper, OVL_XATTR_ORIGIN, origin, false, set);
}

/* Verify (or optionally set) the "upper" file handle on an index dentry */
static inline int ovl_verify_upper(struct dentry *index,
				   struct dentry *upper, bool set)
{
	return ovl_verify_set_fh(index, OVL_XATTR_UPPER, upper, true, set);
}
|
|
||||||
|
|
||||||
/* readdir.c */
|
|
||||||
extern const struct file_operations ovl_dir_operations;
|
|
||||||
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
|
|
||||||
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
|
|
||||||
void ovl_cache_free(struct list_head *list);
|
|
||||||
void ovl_dir_cache_free(struct inode *inode);
|
|
||||||
int ovl_check_d_type_supported(struct path *realpath);
|
|
||||||
void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
|
|
||||||
struct dentry *dentry, int level);
|
|
||||||
int ovl_indexdir_cleanup(struct ovl_fs *ofs);
|
|
||||||
|
|
||||||
/* inode.c */
|
|
||||||
int ovl_set_nlink_upper(struct dentry *dentry);
|
|
||||||
int ovl_set_nlink_lower(struct dentry *dentry);
|
|
||||||
unsigned int ovl_get_nlink(struct dentry *lowerdentry,
|
|
||||||
struct dentry *upperdentry,
|
|
||||||
unsigned int fallback);
|
|
||||||
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
|
|
||||||
int ovl_getattr(const struct path *path, struct kstat *stat,
|
|
||||||
u32 request_mask, unsigned int flags);
|
|
||||||
int ovl_permission(struct inode *inode, int mask);
|
|
||||||
int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
|
|
||||||
const void *value, size_t size, int flags);
|
|
||||||
int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
|
|
||||||
void *value, size_t size);
|
|
||||||
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
|
|
||||||
struct posix_acl *ovl_get_acl(struct inode *inode, int type);
|
|
||||||
int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags);
|
|
||||||
int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags);
|
|
||||||
bool ovl_is_private_xattr(const char *name);
|
|
||||||
|
|
||||||
struct ovl_inode_params {
|
|
||||||
struct inode *newinode;
|
|
||||||
struct dentry *upperdentry;
|
|
||||||
struct ovl_path *lowerpath;
|
|
||||||
struct dentry *index;
|
|
||||||
unsigned int numlower;
|
|
||||||
};
|
|
||||||
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
|
|
||||||
struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
|
|
||||||
bool is_upper);
|
|
||||||
struct inode *ovl_get_inode(struct super_block *sb,
|
|
||||||
struct ovl_inode_params *oip);
|
|
||||||
static inline void ovl_copyattr(struct inode *from, struct inode *to)
|
|
||||||
{
|
|
||||||
to->i_uid = from->i_uid;
|
|
||||||
to->i_gid = from->i_gid;
|
|
||||||
to->i_mode = from->i_mode;
|
|
||||||
to->i_atime = from->i_atime;
|
|
||||||
to->i_mtime = from->i_mtime;
|
|
||||||
to->i_ctime = from->i_ctime;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* dir.c */
|
|
||||||
extern const struct inode_operations ovl_dir_inode_operations;
|
|
||||||
int ovl_cleanup_and_whiteout(struct dentry *workdir, struct inode *dir,
|
|
||||||
struct dentry *dentry);
|
|
||||||
struct ovl_cattr {
|
|
||||||
dev_t rdev;
|
|
||||||
umode_t mode;
|
|
||||||
const char *link;
|
|
||||||
struct dentry *hardlink;
|
|
||||||
};
|
|
||||||
|
|
||||||
#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) })
|
|
||||||
|
|
||||||
struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
|
|
||||||
struct ovl_cattr *attr);
|
|
||||||
int ovl_cleanup(struct inode *dir, struct dentry *dentry);
|
|
||||||
struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr);
|
|
||||||
|
|
||||||
/* copy_up.c */
|
|
||||||
int ovl_copy_up(struct dentry *dentry);
|
|
||||||
int ovl_copy_up_flags(struct dentry *dentry, int flags);
|
|
||||||
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
|
|
||||||
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
|
|
||||||
struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper);
|
|
||||||
int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
|
|
||||||
struct dentry *upper);
|
|
||||||
|
|
||||||
/* export.c */
|
|
||||||
extern const struct export_operations ovl_export_operations;
|
|
||||||
@@ -1,111 +0,0 @@
|
|||||||
/*
|
|
||||||
*
|
|
||||||
* Copyright (C) 2011 Novell Inc.
|
|
||||||
* Copyright (C) 2016 Red Hat, Inc.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 as published by
|
|
||||||
* the Free Software Foundation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
struct ovl_config {
|
|
||||||
char *lowerdir;
|
|
||||||
char *upperdir;
|
|
||||||
char *workdir;
|
|
||||||
bool default_permissions;
|
|
||||||
bool redirect_dir;
|
|
||||||
bool redirect_follow;
|
|
||||||
const char *redirect_mode;
|
|
||||||
bool index;
|
|
||||||
bool nfs_export;
|
|
||||||
int xino;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ovl_sb {
|
|
||||||
struct super_block *sb;
|
|
||||||
dev_t pseudo_dev;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ovl_layer {
|
|
||||||
struct vfsmount *mnt;
|
|
||||||
struct ovl_sb *fs;
|
|
||||||
/* Index of this layer in fs root (upper idx == 0) */
|
|
||||||
int idx;
|
|
||||||
/* One fsid per unique underlying sb (upper fsid == 0) */
|
|
||||||
int fsid;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ovl_path {
|
|
||||||
struct ovl_layer *layer;
|
|
||||||
struct dentry *dentry;
|
|
||||||
};
|
|
||||||
|
|
||||||
/* private information held for overlayfs's superblock */
|
|
||||||
struct ovl_fs {
|
|
||||||
struct vfsmount *upper_mnt;
|
|
||||||
unsigned int numlower;
|
|
||||||
/* Number of unique lower sb that differ from upper sb */
|
|
||||||
unsigned int numlowerfs;
|
|
||||||
struct ovl_layer *lower_layers;
|
|
||||||
struct ovl_sb *lower_fs;
|
|
||||||
/* workbasedir is the path at workdir= mount option */
|
|
||||||
struct dentry *workbasedir;
|
|
||||||
/* workdir is the 'work' directory under workbasedir */
|
|
||||||
struct dentry *workdir;
|
|
||||||
/* index directory listing overlay inodes by origin file handle */
|
|
||||||
struct dentry *indexdir;
|
|
||||||
long namelen;
|
|
||||||
/* pathnames of lower and upper dirs, for show_options */
|
|
||||||
struct ovl_config config;
|
|
||||||
/* creds of process who forced instantiation of super block */
|
|
||||||
const struct cred *creator_cred;
|
|
||||||
bool tmpfile;
|
|
||||||
bool noxattr;
|
|
||||||
/* Did we take the inuse lock? */
|
|
||||||
bool upperdir_locked;
|
|
||||||
bool workdir_locked;
|
|
||||||
/* Inode numbers in all layers do not use the high xino_bits */
|
|
||||||
unsigned int xino_bits;
|
|
||||||
};
|
|
||||||
|
|
||||||
/* private information held for every overlayfs dentry */
|
|
||||||
struct ovl_entry {
|
|
||||||
union {
|
|
||||||
struct {
|
|
||||||
unsigned long flags;
|
|
||||||
};
|
|
||||||
struct rcu_head rcu;
|
|
||||||
};
|
|
||||||
unsigned numlower;
|
|
||||||
struct ovl_path lowerstack[];
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ovl_entry *ovl_alloc_entry(unsigned int numlower);
|
|
||||||
|
|
||||||
static inline struct ovl_entry *OVL_E(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
return (struct ovl_entry *) dentry->d_fsdata;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ovl_inode {
|
|
||||||
struct ovl_dir_cache *cache;
|
|
||||||
const char *redirect;
|
|
||||||
u64 version;
|
|
||||||
unsigned long flags;
|
|
||||||
struct inode vfs_inode;
|
|
||||||
struct dentry *__upperdentry;
|
|
||||||
struct inode *lower;
|
|
||||||
|
|
||||||
/* synchronize copy up and more */
|
|
||||||
struct mutex lock;
|
|
||||||
};
|
|
||||||
|
|
||||||
static inline struct ovl_inode *OVL_I(struct inode *inode)
|
|
||||||
{
|
|
||||||
return container_of(inode, struct ovl_inode, vfs_inode);
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi)
|
|
||||||
{
|
|
||||||
return READ_ONCE(oi->__upperdentry);
|
|
||||||
}
|
|
||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,680 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (C) 2011 Novell Inc.
|
|
||||||
* Copyright (C) 2016 Red Hat, Inc.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 as published by
|
|
||||||
* the Free Software Foundation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <linux/fs.h>
|
|
||||||
#include <linux/mount.h>
|
|
||||||
#include <linux/slab.h>
|
|
||||||
#include <linux/cred.h>
|
|
||||||
#include <linux/xattr.h>
|
|
||||||
#include <linux/exportfs.h>
|
|
||||||
#include <linux/uuid.h>
|
|
||||||
#include <linux/namei.h>
|
|
||||||
#include <linux/ratelimit.h>
|
|
||||||
#include "overlayfs.h"
|
|
||||||
|
|
||||||
int ovl_want_write(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
|
|
||||||
return mnt_want_write(ofs->upper_mnt);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ovl_drop_write(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
|
|
||||||
mnt_drop_write(ofs->upper_mnt);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct dentry *ovl_workdir(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
|
|
||||||
return ofs->workdir;
|
|
||||||
}
|
|
||||||
|
|
||||||
const struct cred *ovl_override_creds(struct super_block *sb)
|
|
||||||
{
|
|
||||||
struct ovl_fs *ofs = sb->s_fs_info;
|
|
||||||
|
|
||||||
return override_creds(ofs->creator_cred);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct super_block *ovl_same_sb(struct super_block *sb)
|
|
||||||
{
|
|
||||||
struct ovl_fs *ofs = sb->s_fs_info;
|
|
||||||
|
|
||||||
if (!ofs->numlowerfs)
|
|
||||||
return ofs->upper_mnt->mnt_sb;
|
|
||||||
else if (ofs->numlowerfs == 1 && !ofs->upper_mnt)
|
|
||||||
return ofs->lower_fs[0].sb;
|
|
||||||
else
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Check if underlying fs supports file handles and try to determine encoding
|
|
||||||
* type, in order to deduce maximum inode number used by fs.
|
|
||||||
*
|
|
||||||
* Return 0 if file handles are not supported.
|
|
||||||
* Return 1 (FILEID_INO32_GEN) if fs uses the default 32bit inode encoding.
|
|
||||||
* Return -1 if fs uses a non default encoding with unknown inode size.
|
|
||||||
*/
|
|
||||||
int ovl_can_decode_fh(struct super_block *sb)
|
|
||||||
{
|
|
||||||
if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry ||
|
|
||||||
uuid_is_null(&sb->s_uuid))
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct dentry *ovl_indexdir(struct super_block *sb)
|
|
||||||
{
|
|
||||||
struct ovl_fs *ofs = sb->s_fs_info;
|
|
||||||
|
|
||||||
return ofs->indexdir;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Index all files on copy up. For now only enabled for NFS export */
|
|
||||||
bool ovl_index_all(struct super_block *sb)
|
|
||||||
{
|
|
||||||
struct ovl_fs *ofs = sb->s_fs_info;
|
|
||||||
|
|
||||||
return ofs->config.nfs_export && ofs->config.index;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Verify lower origin on lookup. For now only enabled for NFS export */
|
|
||||||
bool ovl_verify_lower(struct super_block *sb)
|
|
||||||
{
|
|
||||||
struct ovl_fs *ofs = sb->s_fs_info;
|
|
||||||
|
|
||||||
return ofs->config.nfs_export && ofs->config.index;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
|
|
||||||
{
|
|
||||||
size_t size = offsetof(struct ovl_entry, lowerstack[numlower]);
|
|
||||||
struct ovl_entry *oe = kzalloc(size, GFP_KERNEL);
|
|
||||||
|
|
||||||
if (oe)
|
|
||||||
oe->numlower = numlower;
|
|
||||||
|
|
||||||
return oe;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ovl_dentry_remote(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
return dentry->d_flags &
|
|
||||||
(DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE |
|
|
||||||
DCACHE_OP_REAL);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ovl_dentry_weird(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
/* NOFSCHECK */
|
|
||||||
return false;
|
|
||||||
return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT |
|
|
||||||
DCACHE_MANAGE_TRANSIT |
|
|
||||||
DCACHE_OP_HASH |
|
|
||||||
DCACHE_OP_COMPARE);
|
|
||||||
}
|
|
||||||
|
|
||||||
enum ovl_path_type ovl_path_type(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
struct ovl_entry *oe = dentry->d_fsdata;
|
|
||||||
enum ovl_path_type type = 0;
|
|
||||||
|
|
||||||
if (ovl_dentry_upper(dentry)) {
|
|
||||||
type = __OVL_PATH_UPPER;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Non-dir dentry can hold lower dentry of its copy up origin.
|
|
||||||
*/
|
|
||||||
if (oe->numlower) {
|
|
||||||
type |= __OVL_PATH_ORIGIN;
|
|
||||||
if (d_is_dir(dentry))
|
|
||||||
type |= __OVL_PATH_MERGE;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (oe->numlower > 1)
|
|
||||||
type |= __OVL_PATH_MERGE;
|
|
||||||
}
|
|
||||||
return type;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ovl_path_upper(struct dentry *dentry, struct path *path)
|
|
||||||
{
|
|
||||||
struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
|
|
||||||
|
|
||||||
path->mnt = ofs->upper_mnt;
|
|
||||||
path->dentry = ovl_dentry_upper(dentry);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ovl_path_lower(struct dentry *dentry, struct path *path)
|
|
||||||
{
|
|
||||||
struct ovl_entry *oe = dentry->d_fsdata;
|
|
||||||
|
|
||||||
if (oe->numlower) {
|
|
||||||
path->mnt = oe->lowerstack[0].layer->mnt;
|
|
||||||
path->dentry = oe->lowerstack[0].dentry;
|
|
||||||
} else {
|
|
||||||
*path = (struct path) { };
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
|
|
||||||
{
|
|
||||||
enum ovl_path_type type = ovl_path_type(dentry);
|
|
||||||
|
|
||||||
if (!OVL_TYPE_UPPER(type))
|
|
||||||
ovl_path_lower(dentry, path);
|
|
||||||
else
|
|
||||||
ovl_path_upper(dentry, path);
|
|
||||||
|
|
||||||
return type;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct dentry *ovl_dentry_upper(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
return ovl_upperdentry_dereference(OVL_I(d_inode(dentry)));
|
|
||||||
}
|
|
||||||
|
|
||||||
struct dentry *ovl_dentry_lower(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
struct ovl_entry *oe = dentry->d_fsdata;
|
|
||||||
|
|
||||||
return oe->numlower ? oe->lowerstack[0].dentry : NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ovl_layer *ovl_layer_lower(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
struct ovl_entry *oe = dentry->d_fsdata;
|
|
||||||
|
|
||||||
return oe->numlower ? oe->lowerstack[0].layer : NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct dentry *ovl_dentry_real(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
return ovl_dentry_upper(dentry) ?: ovl_dentry_lower(dentry);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct dentry *ovl_i_dentry_upper(struct inode *inode)
|
|
||||||
{
|
|
||||||
return ovl_upperdentry_dereference(OVL_I(inode));
|
|
||||||
}
|
|
||||||
|
|
||||||
struct inode *ovl_inode_upper(struct inode *inode)
|
|
||||||
{
|
|
||||||
struct dentry *upperdentry = ovl_i_dentry_upper(inode);
|
|
||||||
|
|
||||||
return upperdentry ? d_inode(upperdentry) : NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct inode *ovl_inode_lower(struct inode *inode)
|
|
||||||
{
|
|
||||||
return OVL_I(inode)->lower;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct inode *ovl_inode_real(struct inode *inode)
|
|
||||||
{
|
|
||||||
return ovl_inode_upper(inode) ?: ovl_inode_lower(inode);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
struct ovl_dir_cache *ovl_dir_cache(struct inode *inode)
|
|
||||||
{
|
|
||||||
return OVL_I(inode)->cache;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache)
|
|
||||||
{
|
|
||||||
OVL_I(inode)->cache = cache;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry)
|
|
||||||
{
|
|
||||||
set_bit(flag, &OVL_E(dentry)->flags);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ovl_dentry_clear_flag(unsigned long flag, struct dentry *dentry)
|
|
||||||
{
|
|
||||||
clear_bit(flag, &OVL_E(dentry)->flags);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry)
|
|
||||||
{
|
|
||||||
return test_bit(flag, &OVL_E(dentry)->flags);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ovl_dentry_is_opaque(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
return ovl_dentry_test_flag(OVL_E_OPAQUE, dentry);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ovl_dentry_is_whiteout(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
return !dentry->d_inode && ovl_dentry_is_opaque(dentry);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ovl_dentry_set_opaque(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
ovl_dentry_set_flag(OVL_E_OPAQUE, dentry);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* For hard links and decoded file handles, it's possible for ovl_dentry_upper()
|
|
||||||
* to return positive, while there's no actual upper alias for the inode.
|
|
||||||
* Copy up code needs to know about the existence of the upper alias, so it
|
|
||||||
* can't use ovl_dentry_upper().
|
|
||||||
*/
|
|
||||||
bool ovl_dentry_has_upper_alias(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
return ovl_dentry_test_flag(OVL_E_UPPER_ALIAS, dentry);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ovl_dentry_set_upper_alias(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
ovl_dentry_set_flag(OVL_E_UPPER_ALIAS, dentry);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ovl_redirect_dir(struct super_block *sb)
|
|
||||||
{
|
|
||||||
struct ovl_fs *ofs = sb->s_fs_info;
|
|
||||||
|
|
||||||
return ofs->config.redirect_dir && !ofs->noxattr;
|
|
||||||
}
|
|
||||||
|
|
||||||
const char *ovl_dentry_get_redirect(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
return OVL_I(d_inode(dentry))->redirect;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect)
|
|
||||||
{
|
|
||||||
struct ovl_inode *oi = OVL_I(d_inode(dentry));
|
|
||||||
|
|
||||||
kfree(oi->redirect);
|
|
||||||
oi->redirect = redirect;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ovl_inode_init(struct inode *inode, struct dentry *upperdentry,
|
|
||||||
struct dentry *lowerdentry)
|
|
||||||
{
|
|
||||||
struct inode *realinode = d_inode(upperdentry ?: lowerdentry);
|
|
||||||
|
|
||||||
if (upperdentry)
|
|
||||||
OVL_I(inode)->__upperdentry = upperdentry;
|
|
||||||
if (lowerdentry)
|
|
||||||
OVL_I(inode)->lower = igrab(d_inode(lowerdentry));
|
|
||||||
|
|
||||||
ovl_copyattr(realinode, inode);
|
|
||||||
if (!inode->i_ino)
|
|
||||||
inode->i_ino = realinode->i_ino;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ovl_inode_update(struct inode *inode, struct dentry *upperdentry)
|
|
||||||
{
|
|
||||||
struct inode *upperinode = d_inode(upperdentry);
|
|
||||||
|
|
||||||
WARN_ON(OVL_I(inode)->__upperdentry);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Make sure upperdentry is consistent before making it visible
|
|
||||||
*/
|
|
||||||
smp_wmb();
|
|
||||||
OVL_I(inode)->__upperdentry = upperdentry;
|
|
||||||
if (inode_unhashed(inode)) {
|
|
||||||
if (!inode->i_ino)
|
|
||||||
inode->i_ino = upperinode->i_ino;
|
|
||||||
inode->i_private = upperinode;
|
|
||||||
__insert_inode_hash(inode, (unsigned long) upperinode);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void ovl_dentry_version_inc(struct dentry *dentry, bool impurity)
|
|
||||||
{
|
|
||||||
struct inode *inode = d_inode(dentry);
|
|
||||||
|
|
||||||
WARN_ON(!inode_is_locked(inode));
|
|
||||||
/*
|
|
||||||
* Version is used by readdir code to keep cache consistent. For merge
|
|
||||||
* dirs all changes need to be noted. For non-merge dirs, cache only
|
|
||||||
* contains impure (ones which have been copied up and have origins)
|
|
||||||
* entries, so only need to note changes to impure entries.
|
|
||||||
*/
|
|
||||||
if (OVL_TYPE_MERGE(ovl_path_type(dentry)) || impurity)
|
|
||||||
OVL_I(inode)->version++;
|
|
||||||
}
|
|
||||||
|
|
||||||
u64 ovl_dentry_version_get(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
struct inode *inode = d_inode(dentry);
|
|
||||||
|
|
||||||
WARN_ON(!inode_is_locked(inode));
|
|
||||||
return OVL_I(inode)->version;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ovl_is_whiteout(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
struct inode *inode = dentry->d_inode;
|
|
||||||
|
|
||||||
return inode && IS_WHITEOUT(inode);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct file *ovl_path_open(struct path *path, int flags)
|
|
||||||
{
|
|
||||||
return dentry_open(path, flags | O_NOATIME, current_cred());
|
|
||||||
}
|
|
||||||
|
|
||||||
int ovl_copy_up_start(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
struct ovl_inode *oi = OVL_I(d_inode(dentry));
|
|
||||||
int err;
|
|
||||||
|
|
||||||
err = mutex_lock_interruptible(&oi->lock);
|
|
||||||
if (!err && ovl_dentry_has_upper_alias(dentry)) {
|
|
||||||
err = 1; /* Already copied up */
|
|
||||||
mutex_unlock(&oi->lock);
|
|
||||||
}
|
|
||||||
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ovl_copy_up_end(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
mutex_unlock(&OVL_I(d_inode(dentry))->lock);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ovl_check_origin_xattr(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
int res;
|
|
||||||
|
|
||||||
res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, NULL, 0);
|
|
||||||
|
|
||||||
/* Zero size value means "copied up but origin unknown" */
|
|
||||||
if (res >= 0)
|
|
||||||
return true;
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ovl_check_dir_xattr(struct dentry *dentry, const char *name)
|
|
||||||
{
|
|
||||||
int res;
|
|
||||||
char val;
|
|
||||||
|
|
||||||
if (!d_is_dir(dentry))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
res = vfs_getxattr(dentry, name, &val, 1);
|
|
||||||
if (res == 1 && val == 'y')
|
|
||||||
return true;
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Set an overlay xattr on the upper dentry, degrading gracefully when the
 * upper fs lacks xattr support: remember the fact in ofs->noxattr and
 * return @xerr instead of the real error.
 */
int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry,
		       const char *name, const void *value, size_t size,
		       int xerr)
{
	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
	int err;

	if (ofs->noxattr)
		return xerr;

	err = ovl_do_setxattr(upperdentry, name, value, size, 0);
	if (err != -EOPNOTSUPP)
		return err;

	pr_warn("overlayfs: cannot set %s xattr on upper\n", name);
	ofs->noxattr = true;
	return xerr;
}
|
|
||||||
|
|
||||||
int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry)
|
|
||||||
{
|
|
||||||
int err;
|
|
||||||
|
|
||||||
if (ovl_test_flag(OVL_IMPURE, d_inode(dentry)))
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Do not fail when upper doesn't support xattrs.
|
|
||||||
* Upper inodes won't have origin nor redirect xattr anyway.
|
|
||||||
*/
|
|
||||||
err = ovl_check_setxattr(dentry, upperdentry, OVL_XATTR_IMPURE,
|
|
||||||
"y", 1, 0);
|
|
||||||
if (!err)
|
|
||||||
ovl_set_flag(OVL_IMPURE, d_inode(dentry));
|
|
||||||
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ovl_set_flag(unsigned long flag, struct inode *inode)
|
|
||||||
{
|
|
||||||
set_bit(flag, &OVL_I(inode)->flags);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ovl_clear_flag(unsigned long flag, struct inode *inode)
|
|
||||||
{
|
|
||||||
clear_bit(flag, &OVL_I(inode)->flags);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ovl_test_flag(unsigned long flag, struct inode *inode)
|
|
||||||
{
|
|
||||||
return test_bit(flag, &OVL_I(inode)->flags);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Caller must hold a reference to inode to prevent it from being freed while
|
|
||||||
* it is marked inuse.
|
|
||||||
*/
|
|
||||||
bool ovl_inuse_trylock(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
struct inode *inode = d_inode(dentry);
|
|
||||||
bool locked = false;
|
|
||||||
|
|
||||||
spin_lock(&inode->i_lock);
|
|
||||||
if (!(inode->i_state & I_OVL_INUSE)) {
|
|
||||||
inode->i_state |= I_OVL_INUSE;
|
|
||||||
locked = true;
|
|
||||||
}
|
|
||||||
spin_unlock(&inode->i_lock);
|
|
||||||
|
|
||||||
return locked;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ovl_inuse_unlock(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
if (dentry) {
|
|
||||||
struct inode *inode = d_inode(dentry);
|
|
||||||
|
|
||||||
spin_lock(&inode->i_lock);
|
|
||||||
WARN_ON(!(inode->i_state & I_OVL_INUSE));
|
|
||||||
inode->i_state &= ~I_OVL_INUSE;
|
|
||||||
spin_unlock(&inode->i_lock);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Does this overlay dentry need to be indexed on copy up?
|
|
||||||
*/
|
|
||||||
bool ovl_need_index(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
struct dentry *lower = ovl_dentry_lower(dentry);
|
|
||||||
|
|
||||||
if (!lower || !ovl_indexdir(dentry->d_sb))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
/* Index all files for NFS export and consistency verification */
|
|
||||||
if (ovl_index_all(dentry->d_sb))
|
|
||||||
return true;
|
|
||||||
|
|
||||||
/* Index only lower hardlinks on copy up */
|
|
||||||
if (!d_is_dir(lower) && d_inode(lower)->i_nlink > 1)
|
|
||||||
return true;
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Remove (or whiteout) the index entry of an overlay inode whose overlay
 * nlink has dropped to zero.  Called from ovl_nlink_end().
 *
 * Caller must hold OVL_I(inode)->lock
 */
static void ovl_cleanup_index(struct dentry *dentry)
{
	struct dentry *indexdir = ovl_indexdir(dentry->d_sb);
	struct inode *dir = indexdir->d_inode;
	struct dentry *lowerdentry = ovl_dentry_lower(dentry);
	struct dentry *upperdentry = ovl_dentry_upper(dentry);
	struct dentry *index = NULL;
	struct inode *inode;
	struct qstr name = { };
	int err;

	/* Index entry name is derived from the lower dentry */
	err = ovl_get_index_name(lowerdentry, &name);
	if (err)
		goto fail;

	/* The index entry should be the only remaining upper link */
	inode = d_inode(upperdentry);
	if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) {
		pr_warn_ratelimited("overlayfs: cleanup linked index (%pd2, ino=%lu, nlink=%u)\n",
				    upperdentry, inode->i_ino, inode->i_nlink);
		/*
		 * We either have a bug with persistent union nlink or a lower
		 * hardlink was added while overlay is mounted. Adding a lower
		 * hardlink and then unlinking all overlay hardlinks would drop
		 * overlay nlink to zero before all upper inodes are unlinked.
		 * As a safety measure, when that situation is detected, set
		 * the overlay nlink to the index inode nlink minus one for the
		 * index entry itself.
		 */
		set_nlink(d_inode(dentry), inode->i_nlink - 1);
		ovl_set_nlink_upper(dentry);
		goto out;
	}

	inode_lock_nested(dir, I_MUTEX_PARENT);
	index = lookup_one_len(name.name, indexdir, name.len);
	err = PTR_ERR(index);
	if (IS_ERR(index)) {
		/* Lookup failure: nothing to clean up, keep err for "fail:" */
		index = NULL;
	} else if (ovl_index_all(dentry->d_sb)) {
		/* Whiteout orphan index to block future open by handle */
		err = ovl_cleanup_and_whiteout(indexdir, dir, index);
	} else {
		/* Cleanup orphan index entries */
		err = ovl_cleanup(dir, index);
	}

	inode_unlock(dir);
	if (err)
		goto fail;

out:
	/* name.name was allocated by ovl_get_index_name() */
	kfree(name.name);
	dput(index);
	return;

fail:
	pr_err("overlayfs: cleanup index of '%pd2' failed (%i)\n", dentry, err);
	goto out;
}
|
|
||||||
|
|
||||||
/*
 * Operations that change overlay inode and upper inode nlink need to be
 * synchronized with copy up for persistent nlink accounting.
 *
 * On success with *locked set, the caller must pair with ovl_nlink_end()
 * to release OVL_I(inode)->lock.
 */
int ovl_nlink_start(struct dentry *dentry, bool *locked)
{
	struct ovl_inode *oi = OVL_I(d_inode(dentry));
	const struct cred *old_cred;
	int err;

	/*
	 * NOTE(review): oi is computed before this NULL check; presumably
	 * OVL_I() is pure pointer arithmetic (container_of) and oi is never
	 * dereferenced on this path — confirm.
	 */
	if (!d_inode(dentry))
		return 0;

	/*
	 * With inodes index is enabled, we store the union overlay nlink
	 * in an xattr on the index inode. When whiting out an indexed lower,
	 * we need to decrement the overlay persistent nlink, but before the
	 * first copy up, we have no upper index inode to store the xattr.
	 *
	 * As a workaround, before whiteout/rename over an indexed lower,
	 * copy up to create the upper index. Creating the upper index will
	 * initialize the overlay nlink, so it could be dropped if unlink
	 * or rename succeeds.
	 *
	 * TODO: implement metadata only index copy up when called with
	 * ovl_copy_up_flags(dentry, O_PATH).
	 */
	if (ovl_need_index(dentry) && !ovl_dentry_has_upper_alias(dentry)) {
		err = ovl_copy_up(dentry);
		if (err)
			return err;
	}

	/* Interruptible: unlink/rename can be aborted by a signal here */
	err = mutex_lock_interruptible(&oi->lock);
	if (err)
		return err;

	/* Only indexed non-dirs carry persistent nlink in an xattr */
	if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, d_inode(dentry)))
		goto out;

	old_cred = ovl_override_creds(dentry->d_sb);
	/*
	 * The overlay inode nlink should be incremented/decremented IFF the
	 * upper operation succeeds, along with nlink change of upper inode.
	 * Therefore, before link/unlink/rename, we store the union nlink
	 * value relative to the upper inode nlink in an upper inode xattr.
	 */
	err = ovl_set_nlink_upper(dentry);
	revert_creds(old_cred);

out:
	if (err)
		mutex_unlock(&oi->lock);
	else
		*locked = true;

	return err;
}
|
|
||||||
|
|
||||||
/*
 * Release the nlink accounting lock taken by ovl_nlink_start().
 * If the overlay inode was indexed and its nlink dropped to zero during
 * the locked section, remove its index entry first.
 */
void ovl_nlink_end(struct dentry *dentry, bool locked)
{
	if (locked) {
		if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) &&
		    d_inode(dentry)->i_nlink == 0) {
			const struct cred *old_cred;

			/* Index cleanup needs mounter credentials */
			old_cred = ovl_override_creds(dentry->d_sb);
			ovl_cleanup_index(dentry);
			revert_creds(old_cred);
		}

		mutex_unlock(&OVL_I(d_inode(dentry))->lock);
	}
}
|
|
||||||
|
|
||||||
int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir)
|
|
||||||
{
|
|
||||||
/* Workdir should not be the same as upperdir */
|
|
||||||
if (workdir == upperdir)
|
|
||||||
goto err;
|
|
||||||
|
|
||||||
/* Workdir should not be subdir of upperdir and vice versa */
|
|
||||||
if (lock_rename(workdir, upperdir) != NULL)
|
|
||||||
goto err_unlock;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
err_unlock:
|
|
||||||
unlock_rename(workdir, upperdir);
|
|
||||||
err:
|
|
||||||
pr_err("overlayfs: failed to lock workdir+upperdir\n");
|
|
||||||
return -EIO;
|
|
||||||
}
|
|
||||||
@@ -1,460 +0,0 @@
|
|||||||
/*
|
|
||||||
*
|
|
||||||
* Copyright (C) 2011 Novell Inc.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 as published by
|
|
||||||
* the Free Software Foundation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <linux/module.h>
|
|
||||||
#include <linux/fs.h>
|
|
||||||
#include <linux/slab.h>
|
|
||||||
#include <linux/file.h>
|
|
||||||
#include <linux/splice.h>
|
|
||||||
#include <linux/xattr.h>
|
|
||||||
#include <linux/security.h>
|
|
||||||
#include <linux/uaccess.h>
|
|
||||||
#include <linux/sched.h>
|
|
||||||
#include <linux/namei.h>
|
|
||||||
#include <linux/fdtable.h>
|
|
||||||
#include <linux/ratelimit.h>
|
|
||||||
#include "overlayfs.h"
|
|
||||||
|
|
||||||
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
|
|
||||||
|
|
||||||
static bool __read_mostly ovl_check_copy_up;
|
|
||||||
module_param_named(check_copy_up, ovl_check_copy_up, bool,
|
|
||||||
S_IWUSR | S_IRUGO);
|
|
||||||
MODULE_PARM_DESC(ovl_check_copy_up,
|
|
||||||
"Warn on copy-up when causing process also has a R/O fd open");
|
|
||||||
|
|
||||||
static int ovl_check_fd(const void *data, struct file *f, unsigned int fd)
|
|
||||||
{
|
|
||||||
const struct dentry *dentry = data;
|
|
||||||
|
|
||||||
if (f->f_inode == d_inode(dentry))
|
|
||||||
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
|
|
||||||
f, fd, current->pid, current->comm);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Check the fds open by this process and warn if something like the following
|
|
||||||
* scenario is about to occur:
|
|
||||||
*
|
|
||||||
* fd1 = open("foo", O_RDONLY);
|
|
||||||
* fd2 = open("foo", O_RDWR);
|
|
||||||
*/
|
|
||||||
static void ovl_do_check_copy_up(struct dentry *dentry)
|
|
||||||
{
|
|
||||||
if (ovl_check_copy_up)
|
|
||||||
iterate_fd(current->files, 0, ovl_check_fd, dentry);
|
|
||||||
}
|
|
||||||
|
|
||||||
int ovl_copy_xattr(struct dentry *old, struct dentry *new, unsigned opt)
|
|
||||||
{
|
|
||||||
ssize_t list_size, size, value_size = 0;
|
|
||||||
char *buf, *name, *value = NULL;
|
|
||||||
int uninitialized_var(error);
|
|
||||||
|
|
||||||
if (!old->d_inode->i_op->getxattr ||
|
|
||||||
!new->d_inode->i_op->getxattr)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
list_size = vfs_listxattr(old, NULL, 0);
|
|
||||||
if (list_size <= 0) {
|
|
||||||
if (list_size == -EOPNOTSUPP)
|
|
||||||
return 0;
|
|
||||||
return list_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
buf = kzalloc(list_size, GFP_KERNEL);
|
|
||||||
if (!buf)
|
|
||||||
return -ENOMEM;
|
|
||||||
|
|
||||||
list_size = vfs_listxattr(old, buf, list_size);
|
|
||||||
if (list_size <= 0) {
|
|
||||||
error = list_size;
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
|
|
||||||
retry:
|
|
||||||
size = vfs_getxattr(old, name, value, value_size);
|
|
||||||
if (size == -ERANGE)
|
|
||||||
size = vfs_getxattr(old, name, NULL, 0);
|
|
||||||
|
|
||||||
if (size < 0) {
|
|
||||||
if (OVL_OPT_NOFSCHECK(opt)) {
|
|
||||||
OVL_DEBUG("fail: old=%pd4, i_ino=%lu, name=%s\n",
|
|
||||||
old, old->d_inode->i_ino, name);
|
|
||||||
continue;
|
|
||||||
} else {
|
|
||||||
error = size;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
OVL_DEBUG("success: old=%pd4, i_ino=%lu, name=%s\n",
|
|
||||||
old, old->d_inode->i_ino, name);
|
|
||||||
|
|
||||||
if (size > value_size) {
|
|
||||||
void *new;
|
|
||||||
|
|
||||||
new = krealloc(value, size, GFP_KERNEL);
|
|
||||||
if (!new) {
|
|
||||||
error = -ENOMEM;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
value = new;
|
|
||||||
value_size = size;
|
|
||||||
goto retry;
|
|
||||||
}
|
|
||||||
|
|
||||||
error = vfs_setxattr(new, name, value, size, 0);
|
|
||||||
if (error)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
kfree(value);
|
|
||||||
out:
|
|
||||||
kfree(buf);
|
|
||||||
return error;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Copy @len bytes of file data from @old to @new in OVL_COPY_UP_CHUNK_SIZE
 * chunks using do_splice_direct().  Killable between chunks.
 * Returns 0 on success, negative errno otherwise.
 */
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
	struct file *old_file;
	struct file *new_file;
	loff_t old_pos = 0;
	loff_t new_pos = 0;
	int error = 0;

	if (len == 0)
		return 0;

	old_file = ovl_path_open(old, O_LARGEFILE | O_RDONLY);
	if (IS_ERR(old_file))
		return PTR_ERR(old_file);

	new_file = ovl_path_open(new, O_LARGEFILE | O_WRONLY);
	if (IS_ERR(new_file)) {
		error = PTR_ERR(new_file);
		goto out_fput;
	}

	/* FIXME: copy up sparse files efficiently */
	while (len) {
		size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
		long bytes;

		if (len < this_len)
			this_len = len;

		/* Allow a fatal signal to abort a long copy */
		if (signal_pending_state(TASK_KILLABLE, current)) {
			error = -EINTR;
			break;
		}

		bytes = do_splice_direct(old_file, &old_pos,
					 new_file, &new_pos,
					 this_len, SPLICE_F_MOVE);
		if (bytes <= 0) {
			/* 0 bytes (unexpected EOF) also terminates the loop */
			error = bytes;
			break;
		}
		/* Source and destination offsets must advance in lockstep */
		WARN_ON(old_pos != new_pos);

		len -= bytes;
	}

	fput(new_file);
out_fput:
	fput(old_file);
	return error;
}
|
|
||||||
|
|
||||||
/*
 * Read the target of the symlink @realdentry into a freshly allocated page.
 * Returns a NUL-terminated buffer (caller frees with free_page()) or
 * ERR_PTR() on failure.
 */
static char *ovl_read_symlink(struct dentry *realdentry)
{
	int res;
	char *buf;
	struct inode *inode = realdentry->d_inode;
	mm_segment_t old_fs;

	res = -EINVAL;
	if (!inode->i_op->readlink)
		goto err;

	res = -ENOMEM;
	buf = (char *) __get_free_page(GFP_KERNEL);
	if (!buf)
		goto err;

	old_fs = get_fs();
	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	res = inode->i_op->readlink(realdentry,
				    (char __user *)buf, PAGE_SIZE - 1);
	set_fs(old_fs);
	if (res < 0) {
		free_page((unsigned long) buf);
		goto err;
	}
	/* readlink does not NUL-terminate; PAGE_SIZE - 1 left room for it */
	buf[res] = '\0';

	return buf;

err:
	return ERR_PTR(res);
}
|
|
||||||
|
|
||||||
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
|
|
||||||
{
|
|
||||||
struct iattr attr = {
|
|
||||||
.ia_valid =
|
|
||||||
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
|
|
||||||
.ia_atime = stat->atime,
|
|
||||||
.ia_mtime = stat->mtime,
|
|
||||||
};
|
|
||||||
|
|
||||||
return notify_change(upperdentry, &attr, NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
|
|
||||||
{
|
|
||||||
int err = 0;
|
|
||||||
|
|
||||||
if (!S_ISLNK(stat->mode)) {
|
|
||||||
struct iattr attr = {
|
|
||||||
.ia_valid = ATTR_MODE,
|
|
||||||
.ia_mode = stat->mode,
|
|
||||||
};
|
|
||||||
err = notify_change(upperdentry, &attr, NULL);
|
|
||||||
}
|
|
||||||
if (!err) {
|
|
||||||
struct iattr attr = {
|
|
||||||
.ia_valid = ATTR_UID | ATTR_GID,
|
|
||||||
.ia_uid = stat->uid,
|
|
||||||
.ia_gid = stat->gid,
|
|
||||||
};
|
|
||||||
err = notify_change(upperdentry, &attr, NULL);
|
|
||||||
}
|
|
||||||
if (!err)
|
|
||||||
ovl_set_timestamps(upperdentry, stat);
|
|
||||||
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Copy up one object with workdir and upperdir already rename-locked:
 * create a temporary copy in workdir, fill in data/xattrs/attrs, then
 * atomically rename it into place in upperdir.  On any failure after
 * creation, the temporary is cleaned up.
 */
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
			      struct dentry *dentry, struct path *lowerpath,
			      struct kstat *stat, const char *link)
{
	struct inode *wdir = workdir->d_inode;
	struct inode *udir = upperdir->d_inode;
	struct dentry *newdentry = NULL;
	struct dentry *upper = NULL;
	umode_t mode = stat->mode;
	unsigned opt = ovl_get_config_opt(dentry);
	int err;

	/* Temporary negative dentry in workdir */
	newdentry = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out;

	/* Final destination name in upperdir for the rename below */
	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out1;

	/* Can't properly set mode on creation because of the umask */
	stat->mode &= S_IFMT;
	err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
	stat->mode = mode;
	if (err)
		goto out2;

	if (S_ISREG(stat->mode)) {
		struct path upperpath;

		ovl_path_upper(dentry, &upperpath);
		/* Caller checked there is no upper yet (under lock) */
		BUG_ON(upperpath.dentry != NULL);
		upperpath.dentry = newdentry;

		err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
		if (err)
			goto out_cleanup;
	}

	err = ovl_copy_xattr(lowerpath->dentry, newdentry, opt);
	if (err)
		goto out_cleanup;

	inode_lock(newdentry->d_inode);
	err = ovl_set_attr(newdentry, stat);
	inode_unlock(newdentry->d_inode);
	if (err)
		goto out_cleanup;

	/* Atomically move the fully-prepared copy into upperdir */
	err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
	if (err)
		goto out_cleanup;

	/* ovl_dentry_update() takes over the newdentry reference */
	ovl_dentry_update(dentry, newdentry);
	newdentry = NULL;

	/*
	 * Non-directories become opaque when copied up.
	 */
	if (!S_ISDIR(stat->mode))
		ovl_dentry_set_opaque(dentry, true);
out2:
	dput(upper);
out1:
	dput(newdentry);
out:
	return err;

out_cleanup:
	/* Remove the half-built temporary from workdir */
	ovl_cleanup(wdir, newdentry);
	goto out2;
}
|
|
||||||
|
|
||||||
/*
 * Copy up a single dentry
 *
 * Directory renames only allowed on "pure upper" (already created on
 * upper filesystem, never copied up). Directories which are on lower or
 * are merged may not be renamed. For these -EXDEV is returned and
 * userspace has to deal with it. This means, when copying up a
 * directory we can rely on it and ancestors being stable.
 *
 * Non-directory renames start with copy up of source if necessary. The
 * actual rename will only proceed once the copy up was successful. Copy
 * up uses upper parent i_mutex for exclusion. Since rename can change
 * d_parent it is possible that the copy up will lock the old parent. At
 * that point the file will have already been copied up anyway.
 */
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
		    struct path *lowerpath, struct kstat *stat)
{
	struct dentry *workdir = ovl_workdir(dentry);
	int err;
	struct kstat pstat;
	struct path parentpath;
	struct dentry *upperdir;
	struct dentry *upperdentry;
	const struct cred *old_cred;
	struct cred *override_cred;
	char *link = NULL;

	/* No workdir means the overlay is effectively read-only */
	if (WARN_ON(!workdir))
		return -EROFS;

	ovl_do_check_copy_up(lowerpath->dentry);

	ovl_path_upper(parent, &parentpath);
	upperdir = parentpath.dentry;

	/* Parent attrs saved now so its timestamps can be restored later */
	err = vfs_getattr(&parentpath, &pstat);
	if (err)
		return err;

	/* Symlink target must be read before creds/locks are taken */
	if (S_ISLNK(stat->mode)) {
		link = ovl_read_symlink(lowerpath->dentry);
		if (IS_ERR(link))
			return PTR_ERR(link);
	}

	err = -ENOMEM;
	override_cred = prepare_creds();
	if (!override_cred)
		goto out_free_link;

	/* Create the upper copy as the lower file's owner */
	override_cred->fsuid = stat->uid;
	override_cred->fsgid = stat->gid;
	/*
	 * CAP_SYS_ADMIN for copying up extended attributes
	 * CAP_DAC_OVERRIDE for create
	 * CAP_FOWNER for chmod, timestamp update
	 * CAP_FSETID for chmod
	 * CAP_CHOWN for chown
	 * CAP_MKNOD for mknod
	 */
	cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
	cap_raise(override_cred->cap_effective, CAP_FOWNER);
	cap_raise(override_cred->cap_effective, CAP_FSETID);
	cap_raise(override_cred->cap_effective, CAP_CHOWN);
	cap_raise(override_cred->cap_effective, CAP_MKNOD);
	old_cred = override_creds(override_cred);

	err = -EIO;
	/* Non-NULL means one dir is an ancestor of the other: refuse */
	if (lock_rename(workdir, upperdir) != NULL) {
		pr_err("overlayfs: failed to lock workdir+upperdir\n");
		goto out_unlock;
	}
	upperdentry = ovl_dentry_upper(dentry);
	if (upperdentry) {
		/* Raced with another copy-up?  Nothing to do, then... */
		err = 0;
		goto out_unlock;
	}

	err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
				 stat, link);
	if (!err) {
		/* Restore timestamps on parent (best effort) */
		ovl_set_timestamps(upperdir, &pstat);
	}
out_unlock:
	unlock_rename(workdir, upperdir);
	revert_creds(old_cred);
	put_cred(override_cred);

out_free_link:
	if (link)
		free_page((unsigned long) link);

	return err;
}
|
|
||||||
|
|
||||||
/*
 * Copy up @dentry and any ancestors that are not yet on the upper layer,
 * topmost first, so each copy-up has an upper parent to create into.
 * Returns 0 when the dentry itself has an upper copy, negative errno on
 * the first failure.
 */
int ovl_copy_up(struct dentry *dentry)
{
	int err;

	err = 0;
	while (!err) {
		struct dentry *next;
		struct dentry *parent;
		struct path lowerpath;
		struct kstat stat;
		enum ovl_path_type type = ovl_path_type(dentry);

		/* Done once the target itself is upper */
		if (OVL_TYPE_UPPER(type))
			break;

		next = dget(dentry);
		/* find the topmost dentry not yet copied up */
		for (;;) {
			parent = dget_parent(next);

			type = ovl_path_type(parent);
			if (OVL_TYPE_UPPER(type))
				break;

			/* Walk up: transfer our reference to the parent */
			dput(next);
			next = parent;
		}

		ovl_path_lower(next, &lowerpath);
		err = vfs_getattr(&lowerpath, &stat);
		if (!err)
			err = ovl_copy_up_one(parent, next, &lowerpath, &stat);

		dput(parent);
		dput(next);
	}

	return err;
}
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user