From 68e3473d655a74d79e4a4f8710ba5cbe4f614f07 Mon Sep 17 00:00:00 2001 From: Hajime Tazaki Date: Wed, 7 Jun 2023 15:12:56 +0900 Subject: [PATCH 1/5] lkl: fix hijack epoll handling When the host epoll_create syscall fails, it returns a negative integer, not zero. Thus the condition of the if statement was wrong. This commit fixes this issue. Signed-off-by: Hajime Tazaki --- tools/lkl/lib/hijack/hijack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lkl/lib/hijack/hijack.c b/tools/lkl/lib/hijack/hijack.c index 2ba5c507b11ef8..fd8bf59973d422 100644 --- a/tools/lkl/lib/hijack/hijack.c +++ b/tools/lkl/lib/hijack/hijack.c @@ -353,7 +353,7 @@ int epoll_create1(int flags) CHECK_HOST_CALL(epoll_create1); host_fd = host_epoll_create1(flags); - if (!host_fd) { + if (host_fd < 0) { fprintf(stderr, "%s fail (%d)\n", __func__, errno); return -1; } From 754d8ee1ea54da9e9103a2d03d839c10d81b503e Mon Sep 17 00:00:00 2001 From: Hajime Tazaki Date: Wed, 31 May 2023 19:11:01 +0900 Subject: [PATCH 2/5] lkl: fix syscall tls handling issue when used w/ dlmopen This is problematic when an application uses TLS on thread 0, and LKL is loaded via dlmopen. When lkl_syscall is called, it looks up its pthread key, but key/data already exist for that thread; lkl_syscall then uses the existing data, which is not a task_struct, and crashes. The root cause is that __pthread_keys is a global symbol and isolated via the namespace created by dlmopen, while pthread_getspecific looks at the storage of the thread, which is not isolated via dlmopen. There is a related discussion: - pthread_key_create, pthread_setspecific are incompatible with dlmopen https://sourceware.org/bugzilla/show_bug.cgi?id=24776 A workaround is to avoid using the pthread_key API when LKL is loaded via dlmopen(3) and to replace the TLS functions upon initialization. We only fixed this for the posix host environment, as dlmopen(3) is only usable on Linux implementations (AFAIK).
Signed-off-by: Hajime Tazaki --- tools/lkl/Targets | 2 +- tools/lkl/include/lkl_host.h | 1 + tools/lkl/lib/posix-host.c | 108 ++++++++++++++++++++++++++++++++--- 3 files changed, 102 insertions(+), 9 deletions(-) diff --git a/tools/lkl/Targets b/tools/lkl/Targets index 7da425e774da91..91795aa9d08f02 100644 --- a/tools/lkl/Targets +++ b/tools/lkl/Targets @@ -6,7 +6,7 @@ endif LDFLAGS_lib/hijack/liblkl-hijack-y += -shared -nodefaultlibs LDLIBS_lib/hijack/liblkl-hijack-y += -ldl LDLIBS_lib/hijack/liblkl-hijack-$(LKL_HOST_CONFIG_ARM) += -lgcc -lc -LDLIBS_lib/hijack/liblkl-hijack-$(LKL_HOST_CONFIG_AARCH64) += -lc +LDLIBS_lib/hijack/liblkl-hijack-$(LKL_HOST_CONFIG_AARCH64) += -lgcc -lc LDLIBS_lib/hijack/liblkl-hijack-$(LKL_HOST_CONFIG_I386) += -lc_nonshared progs-$(LKL_HOST_CONFIG_FUSE) += lklfuse diff --git a/tools/lkl/include/lkl_host.h b/tools/lkl/include/lkl_host.h index 62e3c48fbd4e9f..05cd459a841fe3 100644 --- a/tools/lkl/include/lkl_host.h +++ b/tools/lkl/include/lkl_host.h @@ -9,6 +9,7 @@ extern "C" { #include extern struct lkl_host_operations lkl_host_ops; +extern void lkl_change_tls_mode(void); /** * lkl_printf - print a message via the host print operation diff --git a/tools/lkl/lib/posix-host.c b/tools/lkl/lib/posix-host.c index b5f71e2b512739..a667de02efa123 100644 --- a/tools/lkl/lib/posix-host.c +++ b/tools/lkl/lib/posix-host.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "iomem.h" #include "jmp_buf.h" @@ -247,7 +248,7 @@ void *thread_stack(unsigned long *size) return thread_stack; } -static struct lkl_tls_key *tls_alloc(void (*destructor)(void *)) +static struct lkl_tls_key *tsd_alloc(void (*destructor)(void *)) { struct lkl_tls_key *ret = malloc(sizeof(struct lkl_tls_key)); @@ -258,24 +259,107 @@ static struct lkl_tls_key *tls_alloc(void (*destructor)(void *)) return ret; } -static void tls_free(struct lkl_tls_key *key) +static void tsd_free(struct lkl_tls_key *key) { WARN_PTHREAD(pthread_key_delete(key->key)); free(key); } 
-static int tls_set(struct lkl_tls_key *key, void *data) +static int tsd_set(struct lkl_tls_key *key, void *data) { if (WARN_PTHREAD(pthread_setspecific(key->key, data))) return -1; return 0; } -static void *tls_get(struct lkl_tls_key *key) +static void *tsd_get(struct lkl_tls_key *key) { return pthread_getspecific(key->key); } +/** + * when LKL is loaded via dl*m*open(3), the pthread_getspecific() + * doesn't work correctly, as a global symbol, __pthread_keys, is + * duplicated across multiple namespaces and conflicts the same keys + * in multiple users of TSD in a single process, which makes our case + * impossible to work (e.g., host_task from each thread). + * + * To work around this issue, we use TLS, using __thread which doesn't + * require any conflict global symbols. but the default __thread uses + * __tls_get_addr() of glibc function, calling futex, and making a + * dead-lock in our thread. So explicitly initialize with + * initial-exec is needed. + * + * We'll still use the previous *TSD* (thread specific data) + * implementation based on pthread_key_create, as the most of the + * cases, don't hit this situation, as dlmopen is not a common + * practice and a few implementation (i.e., like glibc) has this + * function. 
+ * + */ +#define LKL_MAX_TLS_KEYS (PTHREAD_KEYS_MAX/8) /* 1024/8 = 128 */ +struct __lkl_tls_keys { + int used; + void *data; +}; +static __thread struct __lkl_tls_keys __tls_keys[LKL_MAX_TLS_KEYS]; + +static struct lkl_tls_key *tls_alloc(void (*destructor)(void *)) +{ + int idx; + struct lkl_tls_key *ret = malloc(sizeof(struct lkl_tls_key)); + + for (idx = 0; idx < LKL_MAX_TLS_KEYS; idx++) { + /* data = NULL means the key unused */ + if (__tls_keys[idx].used == 0) { + ret->key = (pthread_key_t)idx; + __tls_keys[idx].used = 1; + return ret; + } + } + + /* if there are no unused keys, return NULL */ + free(ret); + return NULL; +} + +static void tls_free(struct lkl_tls_key *key) +{ + int idx = (int)key->key; + + if (idx < 0 || idx >= LKL_MAX_TLS_KEYS) { + lkl_printf("%s; key not found\n", __func__); + return; + } + __tls_keys[idx].used = 0; + free(key); +} + +static int tls_set(struct lkl_tls_key *key, void *data) +{ + int idx = (int)key->key; + + if (idx < 0 || idx >= LKL_MAX_TLS_KEYS) { + lkl_printf("%s; key not found\n", __func__); + return -1; + } + __tls_keys[idx].data = data; + return 0; +} + +static void *tls_get(struct lkl_tls_key *key) +{ + int idx = (int)key->key; + + if (idx < 0 || idx >= LKL_MAX_TLS_KEYS) { + lkl_printf("%s; key not found\n", __func__); + return NULL; + } + + return __tls_keys[idx].data; +} + + static unsigned long long time_ns(void) { struct timespec ts; @@ -423,10 +507,10 @@ struct lkl_host_operations lkl_host_ops = { .mutex_free = mutex_free, .mutex_lock = mutex_lock, .mutex_unlock = mutex_unlock, - .tls_alloc = tls_alloc, - .tls_free = tls_free, - .tls_set = tls_set, - .tls_get = tls_get, + .tls_alloc = tsd_alloc, + .tls_free = tsd_free, + .tls_set = tsd_set, + .tls_get = tsd_get, .time = time_ns, .timer_alloc = timer_alloc, .timer_set_oneshot = timer_set_oneshot, @@ -451,6 +535,14 @@ struct lkl_host_operations lkl_host_ops = { #endif }; +void lkl_change_tls_mode(void) +{ + lkl_host_ops.tls_alloc = tls_alloc; + lkl_host_ops.tls_free = 
tls_free; + lkl_host_ops.tls_set = tls_set; + lkl_host_ops.tls_get = tls_get; +} + static int fd_get_capacity(struct lkl_disk disk, unsigned long long *res) { off_t off; From d307e6d8abb771ce564ed45c5bf2a9da04f08449 Mon Sep 17 00:00:00 2001 From: Hajime Tazaki Date: Tue, 6 Jun 2023 09:02:52 +0900 Subject: [PATCH 3/5] lkl: add a dlmopen testcase This commit adds a testcase for the situation where LKL is loaded via the dlmopen(3) function call. In this situation, the thread specific data which LKL uses behaves incorrectly and causes an invalid memory access. This test should detect this case. note 230614: the test currently hangs on multiple pthread_exit() calls, which is due to lock handling on load of libgcc_s.so. Still investigating what's going on. If we replace dlmopen with dlopen, everything works fine. Signed-off-by: Hajime Tazaki --- tools/lkl/.gitignore | 1 + tools/lkl/Targets | 4 + tools/lkl/tests/Build | 1 + tools/lkl/tests/test-dlmopen.c | 134 +++++++++++++++++++++++++++++++++ 4 files changed, 140 insertions(+) create mode 100644 tools/lkl/tests/test-dlmopen.c diff --git a/tools/lkl/.gitignore b/tools/lkl/.gitignore index 1a8ee8acbc4e3f..2a5f4273b0b079 100644 --- a/tools/lkl/.gitignore +++ b/tools/lkl/.gitignore @@ -10,6 +10,7 @@ tests/net-test tests/disk tests/disk-vfio-pci tests/config +tests/test-dlmopen Makefile.conf include/lkl_autoconf.h include/kernel_config.h diff --git a/tools/lkl/Targets b/tools/lkl/Targets index 91795aa9d08f02..54b8c2334b570b 100644 --- a/tools/lkl/Targets +++ b/tools/lkl/Targets @@ -26,6 +26,10 @@ progs-y += tests/disk progs-y += tests/disk-vfio-pci progs-y += tests/net-test progs-y += tests/config +ifneq ($(LKL_HOST_CONFIG_BSD),y) +progs-y += tests/test-dlmopen +LDLIBS_tests/test-dlmopen-$(LKL_HOST_CONFIG_POSIX) += -ldl +endif # LKL fuzzers fuzzers-y += fuzzers/hid/hid-fuzzer diff --git a/tools/lkl/tests/Build b/tools/lkl/tests/Build index ed943821d72e4d..b156f780ae402e 100644 --- a/tools/lkl/tests/Build +++ b/tools/lkl/tests/Build @@ -3,3 +3,4 
@@ disk-y += disk.o cla.o test.o disk-vfio-pci-y += disk-vfio-pci.o cla.o test.o net-test-y += net-test.o cla.o test.o config-y += config.o test.o +test-dlmopen-y += test-dlmopen.o test.o diff --git a/tools/lkl/tests/test-dlmopen.c b/tools/lkl/tests/test-dlmopen.c new file mode 100644 index 00000000000000..292eec7094a832 --- /dev/null +++ b/tools/lkl/tests/test-dlmopen.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* if dlemopen is not implemented, skip test */ +#if defined(__x86_64__) && defined(__linux__) +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include "test.h" + +#define CMD_LINE "mem=16M loglevel=8" + +/* glibc (may) only supports dlmopen(3) */ +#ifndef LM_ID_NEWLM +#define NO_DLMOPEN_LINUX 1 +#else +#define NO_DLMOPEN_LINUX 0 +#endif + +static int lkl_test_dlmopen(void) +{ + void *handle; + long ret; + char *filename = "liblkl.so"; + long params[6] = {0}; + int (*fn_init)(struct lkl_host_operations *ops); + int (*fn_start)(char *fmt, ...); + long (*fn_syscall)(long no, long *params); + long (*fn_sys_halt)(void); + void (*fn_tls_mode)(void); + struct lkl_host_operations *lkl_host_ops; + + handle = dlmopen(LM_ID_NEWLM, filename, RTLD_NOW | RTLD_LOCAL); + if (!handle) { + lkl_test_logf("%s: dlmopen failed, %s\n", __func__, dlerror()); + return TEST_FAILURE; + } + + fn_init = dlsym(handle, "lkl_init"); + if (!fn_init) { + lkl_test_logf("%s: dlsym failed, %s\n", __func__, dlerror()); + return TEST_FAILURE; + } + + lkl_host_ops = dlsym(handle, "lkl_host_ops"); + if (!lkl_host_ops) { + lkl_test_logf("%s: dlsym failed, %s\n", __func__, dlerror()); + return TEST_FAILURE; + } + + fn_start = dlsym(handle, "lkl_start_kernel"); + if (!fn_start) { + lkl_test_logf("%s: dlsym failed, %s\n", __func__, dlerror()); + return TEST_FAILURE; + } + + fn_syscall = dlsym(handle, "lkl_syscall"); + if (!fn_syscall) { + lkl_test_logf("%s: dlsym failed, %s\n", __func__, dlerror()); + return TEST_FAILURE; + } + + fn_sys_halt = 
dlsym(handle, "lkl_sys_halt"); + if (!fn_sys_halt) { + lkl_test_logf("%s: dlsym failed, %s\n", __func__, dlerror()); + return TEST_FAILURE; + } + + fn_tls_mode = dlsym(handle, "lkl_change_tls_mode"); + if (!fn_tls_mode) { + lkl_test_logf("%s: dlsym failed, %s\n", __func__, dlerror()); + return TEST_FAILURE; + } + + /* start calling resolved symbols */ + fn_tls_mode(); + fn_init(lkl_host_ops); + ret = fn_start(CMD_LINE); + if (ret != 0) { + lkl_test_logf("lkl_start_kernel() = %ld %s\n", + ret, ret < 0 ? lkl_strerror(ret) : ""); + return TEST_FAILURE; + } + + ret = fn_syscall(__lkl__NR_getpid, params); + lkl_test_logf("getpid() = %ld\n", ret); + if (ret != 1) { + lkl_test_logf("getpid() = %ld %s\n", ret, ret < 0 ? lkl_strerror(ret) : ""); + return TEST_FAILURE; + } + + ret = fn_sys_halt(); + if (ret != 0) { + lkl_test_logf("halt() = %ld %s\n", ret, ret < 0 ? lkl_strerror(ret) : ""); + return TEST_FAILURE; + } + + return ret == 0 ? TEST_SUCCESS : TEST_FAILURE; +} + +struct lkl_test tests[] = { + LKL_TEST(dlmopen), +}; + +int main(int argc, const char **argv) +{ + int ret; + + if (NO_DLMOPEN_LINUX) { + lkl_test_logf("no dlmopen support\n"); + return TEST_SKIP; + } + + ret = lkl_test_run(tests, sizeof(tests)/sizeof(struct lkl_test), + "dlmopen"); + + lkl_cleanup(); + + return ret; +} + +#else +#include "test.h" + +int main(int argc, const char **argv) +{ + lkl_test_logf("no x86_64 arch supported\n"); + return TEST_SKIP; +} +#endif /* defined (__x86_64__) && defined (__linux__) */ From a3be0d4112d7051496037a6437af4ae0455e39dd Mon Sep 17 00:00:00 2001 From: Hajime Tazaki Date: Wed, 28 Jun 2023 10:55:23 +0900 Subject: [PATCH 4/5] lkl: ignore PREFER_DEFINED_ATTRIBUTE_MACRO warnings from checkpatch This commit add an exception to the checkpatch.pl to silence warnings of compiler attributes styles (e.g., __unused, __alias, etc), as the userspace code used in LKL is hard to include headers from include/linux/compiler.h and tools/include/linux/compiler.h. 
Signed-off-by: Hajime Tazaki --- tools/lkl/scripts/checkpatch.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/lkl/scripts/checkpatch.sh b/tools/lkl/scripts/checkpatch.sh index e4d8316eb36097..b1c86123db99d4 100755 --- a/tools/lkl/scripts/checkpatch.sh +++ b/tools/lkl/scripts/checkpatch.sh @@ -52,7 +52,8 @@ if [ -z "$c" ]; then exit 0 fi -./scripts/checkpatch.pl $Q --summary-file --ignore FILE_PATH_CHANGES $tmp/*.patch +./scripts/checkpatch.pl $Q --summary-file --ignore FILE_PATH_CHANGES \ + --ignore PREFER_DEFINED_ATTRIBUTE_MACRO $tmp/*.patch rm $tmp/*.patch # checkpatch.pl does not know how to deal with 3 way diffs which would From 47af656b9b4885d70d3e56990fb2f6be28579f6d Mon Sep 17 00:00:00 2001 From: Hajime Tazaki Date: Wed, 26 Jul 2023 23:14:41 +0900 Subject: [PATCH 5/5] lkl: integrate with zpoline for alternate hijack backend This commit introduces an integration with zpoline (*1), which is a mechanism to rewrite binaries upon loading. zpoline allows us to redirect system calls to different functions; the current LKL hijack library does the same thing in a different way, by using symbol replacement via LD_PRELOAD. The benefit of zpoline is that the replacement happens at the `syscall` or `sysenter` instruction itself, so a userspace program can catch all syscalls, while the approach based on LD_PRELOAD cannot when the symbols of interest are hidden within libc (e.g., __socket). For more detail about the internals of zpoline, take a look at *1.
*1: https://github.com/yasukata/zpoline Signed-off-by: Hajime Tazaki --- .github/workflows/ci.yml | 20 ++++ Documentation/lkl.txt | 65 +++++++++++++ tools/lkl/Makefile.autoconf | 6 ++ tools/lkl/Targets | 4 + tools/lkl/bin/lkl-hijack.sh | 11 ++- tools/lkl/lib/hijack/Build | 5 + tools/lkl/lib/hijack/hijack.c | 29 +++--- tools/lkl/lib/hijack/hijack.h | 21 +++++ tools/lkl/lib/hijack/init.c | 26 +++++- tools/lkl/lib/hijack/init.h | 3 + tools/lkl/lib/hijack/preload.c | 68 ++++++++++++++ tools/lkl/lib/hijack/zpoline.c | 165 +++++++++++++++++++++++++++++++++ tools/lkl/tests/hijack-test.sh | 15 ++- tools/lkl/tests/run.py | 1 + 14 files changed, 417 insertions(+), 22 deletions(-) create mode 100644 tools/lkl/lib/hijack/hijack.h create mode 100644 tools/lkl/lib/hijack/preload.c create mode 100644 tools/lkl/lib/hijack/zpoline.c diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f7466f805c7414..9de785679de8f4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,6 +36,12 @@ jobs: runs_on: ubuntu-22.04 shell: bash build_options: "LKL_FUZZING=1 fuzzers" + - displayTargetName: zpoline + # maybe integrate with default Linux build once the function becomes stable + os: unix + runs_on: ubuntu-22.04 + shell: bash + build_options: "zpoline=./zpoline" timeout-minutes: 100 env: CCACHE_DIR: ${{ github.workspace }}/.ccache @@ -128,6 +134,20 @@ jobs: which gcc ccache -z + - name: install zpoline + if: matrix.displayTargetName == 'zpoline' + run: | + sudo apt install -y binutils-dev + git clone https://github.com/yasukata/zpoline + cd zpoline + git checkout 022a3b8c7a5c23bfd99162b478bf3eb5f70c07a2 + make + cd .. 
+ # This is the whole point of zpoline + echo "==== setting mmap_min_addr ====" + sudo sh -c "echo 0 > /proc/sys/vm/mmap_min_addr" + echo "setting env variable (debug)" + echo "ZPOLINE_DEBUG=0" >> "$GITHUB_ENV" - name: Build run: | make -j4 -C tools/lkl ${{ matrix.build_options }} diff --git a/Documentation/lkl.txt b/Documentation/lkl.txt index e480d64dd5966a..42db98c51506c8 100644 --- a/Documentation/lkl.txt +++ b/Documentation/lkl.txt @@ -460,6 +460,71 @@ The following are the list of keys to describe a JSON file. "nameserver":"8.8.8.8" ``` +LKL hijack library with zpoline +------------------------------- + +[zpoline](https://github.com/yasukata/zpoline) is an alternative to +syscall hijack based on LD_PRELOAD, which is still the default on LKL. +The zpoline library rewrites the binary of the loaded program upon +instantiation, then installs hook functions for the original syscalls. +The LKL hijack library works together with zpoline by loading LKL. + +zpoline currently only works on x86_64 machines. + +To use the zpoline-enabled hijack library, please follow the +instructions below. + +- Build +``` +make -C tools/lkl -j8 zpoline=../zpoline +``` + +This assumes `zpoline` has been downloaded to `../zpoline` and already +built before the LKL build. + +- Execution + +zpoline rewrites the memory address 0x0 to hook syscalls, but non-root +users don't have the privilege to map that address. The following +configuration allows us to use zpoline without root privilege. + +``` +sudo sh -c "echo 0 > /proc/sys/vm/mmap_min_addr" +``` + +Then, execute the command with the environment variable `LKL_HIJACK_ZPOLINE=1`. + +``` +LKL_HIJACK_ZPOLINE=1 LKL_HIJACK_CONFIG_FILE=lkl-tap.json \ + ./tools/lkl/bin/lkl-hijack.sh ping www.google.com +``` + +The file `lkl-tap.json` can be prepared like this. 
+ +``` +{ + "gateway": "172.17.0.1", + "nameserver": "8.8.8.8", + "interfaces": [ + { + "ip": "172.17.0.39", + "masklen": "16", + "mac": "00:0d:0b:94:4e:97", + "param": "tap0", + "type": "tap" + } + ] +} +``` + +The preload hijack library, which is the default one, uses the host +name resolver. If the host uses a nameserver defined in +`/etc/resolv.conf` such as 127.0.0.53, that nameserver does not accept +DNS requests from the point of view of the LKL instance. + +With zpoline, all syscalls used for name resolution are replaced, so +`ping` works with a host name. + FAQ === diff --git a/tools/lkl/Makefile.autoconf b/tools/lkl/Makefile.autoconf index 3c0d07ac6835d6..a15753193a4f6e 100644 --- a/tools/lkl/Makefile.autoconf +++ b/tools/lkl/Makefile.autoconf @@ -62,6 +62,11 @@ define virtio_net_vde LDLIBS += $(shell pkg-config --libs vdeplug) endef +define zpoline_conf + $(eval zpoline_dir=$(abspath $(srctree)/$(1))) + $(if $(strip $(foreach f, $(zpoline_dir), $(wildcard $(f)/libzpoline.so))),$(call set_autoconf_var,ZPOLINE_DIR,$(zpoline_dir))) +endef + define posix_host $(call set_autoconf_var,POSIX,y) $(call set_autoconf_var,VIRTIO_NET,y) @@ -82,6 +87,7 @@ define posix_host $(if $(filter $(1),elf64-x86-64-freebsd),$(call set_autoconf_var,NEEDS_LARGP,y)) $(if $(filter $(1),elf32-i386),$(call set_autoconf_var,I386,y)) $(if $(strip $(call find_include,jsmn.h)),$(call set_autoconf_var,JSMN,y)) + $(if $(filter %,$(zpoline)),$(call zpoline_conf,$(zpoline))) endef define nt64_host diff --git a/tools/lkl/Targets b/tools/lkl/Targets index 54b8c2334b570b..8f1b66c5073906 100644 --- a/tools/lkl/Targets +++ b/tools/lkl/Targets @@ -2,6 +2,7 @@ libs-y += lib/liblkl ifneq ($(LKL_HOST_CONFIG_BSD),y) libs-$(LKL_HOST_CONFIG_POSIX) += lib/hijack/liblkl-hijack +libs-$(LKL_HOST_CONFIG_POSIX) += lib/hijack/liblkl-zpoline endif LDFLAGS_lib/hijack/liblkl-hijack-y += -shared -nodefaultlibs LDLIBS_lib/hijack/liblkl-hijack-y += -ldl @@ -9,6 +10,9 @@ LDLIBS_lib/hijack/liblkl-hijack-$(LKL_HOST_CONFIG_ARM) 
+= -lgcc -lc LDLIBS_lib/hijack/liblkl-hijack-$(LKL_HOST_CONFIG_AARCH64) += -lgcc -lc LDLIBS_lib/hijack/liblkl-hijack-$(LKL_HOST_CONFIG_I386) += -lc_nonshared +LDFLAGS_lib/hijack/liblkl-zpoline-$(LKL_HOST_CONFIG_POSIX) += -shared -nodefaultlibs +LDLIBS_lib/hijack/liblkl-zpoline-$(LKL_HOST_CONFIG_POSIX) += -ldl -lc + progs-$(LKL_HOST_CONFIG_FUSE) += lklfuse LDLIBS_lklfuse-y := -lfuse diff --git a/tools/lkl/bin/lkl-hijack.sh b/tools/lkl/bin/lkl-hijack.sh index e4f1e0c47b6995..260251bcc5ecc4 100755 --- a/tools/lkl/bin/lkl-hijack.sh +++ b/tools/lkl/bin/lkl-hijack.sh @@ -13,10 +13,19 @@ ## script_dir=$(cd $(dirname ${BASH_SOURCE:-$0}); pwd) +. ${script_dir}/../tests/autoconf.sh export LD_LIBRARY_PATH=${script_dir}/../lib/hijack if [ -n ${LKL_HIJACK_DEBUG+x} ] then trap '' TSTP fi -LD_PRELOAD=liblkl-hijack.so $* + + +if [ -n "${LKL_HIJACK_ZPOLINE}" ] +then + export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${LKL_HOST_CONFIG_ZPOLINE_DIR} + LD_PRELOAD=libzpoline.so LIBZPHOOK=liblkl-zpoline.so $* +else + LD_PRELOAD=liblkl-hijack.so $* +fi diff --git a/tools/lkl/lib/hijack/Build b/tools/lkl/lib/hijack/Build index e68e93a3328ac6..0c807ef2e2b5be 100644 --- a/tools/lkl/lib/hijack/Build +++ b/tools/lkl/lib/hijack/Build @@ -1,4 +1,9 @@ +liblkl-hijack-y += preload.o liblkl-hijack-y += hijack.o liblkl-hijack-y += init.o liblkl-hijack-y += xlate.o +liblkl-zpoline-y += zpoline.o +liblkl-zpoline-y += hijack.o +liblkl-zpoline-y += init.o +liblkl-zpoline-y += xlate.o diff --git a/tools/lkl/lib/hijack/hijack.c b/tools/lkl/lib/hijack/hijack.c index fd8bf59973d422..0dfe08aacb08d6 100644 --- a/tools/lkl/lib/hijack/hijack.c +++ b/tools/lkl/lib/hijack/hijack.c @@ -31,8 +31,9 @@ #include "xlate.h" #include "init.h" +#include "hijack.h" -static int is_lklfd(int fd) +int is_lklfd(int fd) { if (fd < LKL_FD_OFFSET) return 0; @@ -167,8 +168,8 @@ HOST_CALL(write) HOST_CALL(pipe2) HOST_CALL(setsockopt); -int setsockopt(int fd, int level, int optname, const void *optval, - socklen_t optlen) +int 
hijack_setsockopt(int fd, int level, int optname, const void *optval, + socklen_t optlen) { CHECK_HOST_CALL(setsockopt); if (!is_lklfd(fd)) @@ -178,7 +179,7 @@ int setsockopt(int fd, int level, int optname, const void *optval, } HOST_CALL(getsockopt); -int getsockopt(int fd, int level, int optname, void *optval, socklen_t *optlen) +int hijack_getsockopt(int fd, int level, int optname, void *optval, socklen_t *optlen) { CHECK_HOST_CALL(getsockopt); if (!is_lklfd(fd)) @@ -240,7 +241,7 @@ int fcntl(int fd, int cmd, ...) } HOST_CALL(poll); -int poll(struct pollfd *fds, nfds_t nfds, int timeout) +int hijack_poll(struct pollfd *fds, nfds_t nfds, int timeout) { unsigned int i, lklfds = 0, hostfds = 0; @@ -264,10 +265,8 @@ int poll(struct pollfd *fds, nfds_t nfds, int timeout) return lkl_sys_poll((struct lkl_pollfd *)fds, nfds, timeout); } -int __poll(struct pollfd *, nfds_t, int) __attribute__((alias("poll"))); - HOST_CALL(select); -int select(int nfds, fd_set *r, fd_set *w, fd_set *e, struct timeval *t) +int hijack_select(int nfds, fd_set *r, fd_set *w, fd_set *e, struct timeval *t) { int fd, hostfds = 0, lklfds = 0; @@ -324,7 +323,7 @@ int close(int fd) } HOST_CALL(epoll_create); -int epoll_create(int size) +int hijack_epoll_create(int size) { int host_fd; @@ -346,7 +345,7 @@ int epoll_create(int size) } HOST_CALL(epoll_create1); -int epoll_create1(int flags) +int hijack_epoll_create1(int flags) { int host_fd; @@ -369,7 +368,7 @@ int epoll_create1(int flags) HOST_CALL(epoll_ctl); -int epoll_ctl(int epollfd, int op, int fd, struct epoll_event *event) +int hijack_epoll_ctl(int epollfd, int op, int fd, struct epoll_event *event) { CHECK_HOST_CALL(epoll_ctl); @@ -404,7 +403,7 @@ static void *host_epollwait(void *arg) return (void *)(intptr_t)ret; } -int epoll_wait(int epfd, struct epoll_event *events, +int hijack_epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout) { CHECK_HOST_CALL(epoll_wait); @@ -541,7 +540,7 @@ int epoll_wait(int epfd, struct 
epoll_event *events, return ret; } -int eventfd(unsigned int count, int flags) +int hijack_eventfd(unsigned int count, int flags) { if (!lkl_running) { int (*f)(unsigned int, int) = resolve_sym("eventfd"); @@ -553,7 +552,7 @@ int eventfd(unsigned int count, int flags) } HOST_CALL(eventfd_read); -int eventfd_read(int fd, uint64_t *value) +int hijack_eventfd_read(int fd, uint64_t *value) { CHECK_HOST_CALL(eventfd_read); @@ -565,7 +564,7 @@ int eventfd_read(int fd, uint64_t *value) } HOST_CALL(eventfd_write); -int eventfd_write(int fd, uint64_t value) +int hijack_eventfd_write(int fd, uint64_t value) { CHECK_HOST_CALL(eventfd_write); diff --git a/tools/lkl/lib/hijack/hijack.h b/tools/lkl/lib/hijack/hijack.h new file mode 100644 index 00000000000000..e9f55d8389c591 --- /dev/null +++ b/tools/lkl/lib/hijack/hijack.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include + + +int is_lklfd(int fd); +int hijack_setsockopt(int fd, int level, int optname, const void *optval, + socklen_t optlen); +int hijack_getsockopt(int fd, int level, int optname, void *optval, socklen_t *optlen); +int hijack_poll(struct pollfd *fds, nfds_t nfds, int timeout); +int hijack_select(int nfds, fd_set *r, fd_set *w, fd_set *e, struct timeval *t); +int hijack_eventfd(unsigned int count, int flags); +int hijack_epoll_create(int size); +int hijack_epoll_create1(int flags); +int hijack_epoll_ctl(int epollfd, int op, int fd, struct epoll_event *event); +int hijack_epoll_wait(int epfd, struct epoll_event *events, + int maxevents, int timeout); +int hijack_eventfd_read(int fd, uint64_t *value); +int hijack_eventfd_write(int fd, uint64_t value); diff --git a/tools/lkl/lib/hijack/init.c b/tools/lkl/lib/hijack/init.c index 11ef12010aef45..8228fe6da4fb0a 100644 --- a/tools/lkl/lib/hijack/init.c +++ b/tools/lkl/lib/hijack/init.c @@ -112,8 +112,8 @@ static int config_load(void) return ret; } -void __attribute__((constructor)) -hijack_init(void) +void +__hijack_init(void) { 
int ret, i, dev_null; int single_cpu_mode = 0; @@ -225,8 +225,17 @@ hijack_init(void) lkl_load_config_post(cfg); } -void __attribute__((destructor)) -hijack_fini(void) +void __attribute__((constructor)) +hijack_init(void) +{ + if (getenv("LKL_HIJACK_ZPOLINE")) + return; + + return __hijack_init(); +} + +void +__hijack_fini(void) { int i; int err; @@ -257,3 +266,12 @@ hijack_fini(void) lkl_cleanup(); } + +void __attribute__((destructor)) +hijack_fini(void) +{ + if (getenv("LKL_HIJACK_ZPOLINE")) + return; + + return __hijack_fini(); +} diff --git a/tools/lkl/lib/hijack/init.h b/tools/lkl/lib/hijack/init.h index be4448f8fe6b9e..2f7a51d3c0a00a 100644 --- a/tools/lkl/lib/hijack/init.h +++ b/tools/lkl/lib/hijack/init.h @@ -4,4 +4,7 @@ extern int lkl_running; extern int dual_fds[]; +void __hijack_init(void); +void __hijack_fini(void); + #endif /*_LKL_HIJACK_INIT_H */ diff --git a/tools/lkl/lib/hijack/preload.c b/tools/lkl/lib/hijack/preload.c new file mode 100644 index 00000000000000..ec0c3d3fd09bfe --- /dev/null +++ b/tools/lkl/lib/hijack/preload.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * system calls hook by LD_PRELOAD + * Copyright (c) 2023 Hajime Tazaki + * + * Author: Hajime Tazaki + * + */ + +#include "hijack.h" + +int setsockopt(int fd, int level, int optname, const void *optval, + socklen_t optlen) +{ + return hijack_setsockopt(fd, level, optname, optval, optlen); +} + +int getsockopt(int fd, int level, int optname, void *optval, socklen_t *optlen) +{ + return hijack_getsockopt(fd, level, optname, optval, optlen); +} + +int poll(struct pollfd *fds, nfds_t nfds, int timeout) +{ + return hijack_poll(fds, nfds, timeout); +} +int __poll(struct pollfd *, nfds_t, int) __attribute__((alias("poll"))); + +int select(int nfds, fd_set *r, fd_set *w, fd_set *e, struct timeval *t) +{ + return hijack_select(nfds, r, w, e, t); +} + +int epoll_create(int size) +{ + return hijack_epoll_create(size); +} + +int epoll_create1(int flags) +{ + return 
hijack_epoll_create1(flags); +} + +int epoll_ctl(int epollfd, int op, int fd, struct epoll_event *event) +{ + return hijack_epoll_ctl(epollfd, op, fd, event); +} + +int epoll_wait(int epfd, struct epoll_event *events, + int maxevents, int timeout) +{ + return hijack_epoll_wait(epfd, events, maxevents, timeout); +} + +int eventfd(unsigned int count, int flags) +{ + return hijack_eventfd(count, flags); +} + +int eventfd_read(int fd, uint64_t *value) +{ + return hijack_eventfd_read(fd, value); +} + +int eventfd_write(int fd, uint64_t value) +{ + return hijack_eventfd_write(fd, value); +} diff --git a/tools/lkl/lib/hijack/zpoline.c b/tools/lkl/lib/hijack/zpoline.c new file mode 100644 index 00000000000000..12d47c3c2f7fd4 --- /dev/null +++ b/tools/lkl/lib/hijack/zpoline.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * zpoline related code for hijack + * Copyright (c) 2023 Hajime Tazaki + * + * Author: Hajime Tazaki + * + * Note: https://github.com/yasukata/zpoline + */ + +/* zpoline only works on x86_64 architecture */ +#ifdef __x86_64__ +#include +#include + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +#include "init.h" +#include "xlate.h" +#include "hijack.h" + +#include + +/* XXX: include doesn't do the job.. */ +extern __pid_t gettid(void) __THROW; + +#define CALL_LKL_FD_SYSCALL(x) \ +{ \ + case __NR_##x: \ + if (!is_lklfd(a2)) \ + ret = syscall(a1, a2, a3, a4, a5, a6, a7); \ + else { \ + long p[6] = {a2, a3, a4, a5, a6, a7}; \ + ret = lkl_syscall(__lkl__NR_##x, p); \ + } \ + break; \ +} + + +#define ZPOLINE_DEBUG 0 +#define dprintf(fmt, ...) 
\ + do { \ + if (ZPOLINE_DEBUG) { \ + printf(fmt, ##__VA_ARGS__); \ + } \ + } while (0) + +long zpoline_lkl_hook(int64_t a1, int64_t a2, int64_t a3, + int64_t a4, int64_t a5, int64_t a6, + int64_t a7) +{ + int ret; + + dprintf("syscall %ld: tid: %d\n", a1, gettid()); + if (!lkl_running) { + if (a1 == __NR_clone) { + if (a2 & CLONE_VM) { // pthread creation + /* push return address to the stack */ + a3 -= sizeof(uint64_t); + *((uint64_t *) a3) = a7; + } + } + return syscall(a1, a2, a3, a4, a5, a6); + } + + switch (a1) { + CALL_LKL_FD_SYSCALL(sendmsg); + CALL_LKL_FD_SYSCALL(recvmsg); + CALL_LKL_FD_SYSCALL(sendmmsg); + CALL_LKL_FD_SYSCALL(recvmmsg); + CALL_LKL_FD_SYSCALL(bind); + CALL_LKL_FD_SYSCALL(connect); + CALL_LKL_FD_SYSCALL(getsockopt); + CALL_LKL_FD_SYSCALL(setsockopt); + CALL_LKL_FD_SYSCALL(getsockname); + CALL_LKL_FD_SYSCALL(sendto); + CALL_LKL_FD_SYSCALL(recvfrom); + CALL_LKL_FD_SYSCALL(listen); + CALL_LKL_FD_SYSCALL(accept); + CALL_LKL_FD_SYSCALL(close); + CALL_LKL_FD_SYSCALL(ioctl); + CALL_LKL_FD_SYSCALL(fcntl); + CALL_LKL_FD_SYSCALL(read); + CALL_LKL_FD_SYSCALL(write); + CALL_LKL_FD_SYSCALL(pread64); + case __NR_socket: + ret = lkl_sys_socket(a2, a3, a4); + if (ret < 0) /* LKL could not create it: fall back to a host socket and keep its result */ + ret = syscall(a1, a2, a3, a4, a5, a6, a7); + break; + case __NR_openat: + if (!lkl_running) + ret = syscall(a1, a2, a3, a4, a5, a6, a7); + else { + ret = lkl_sys_open((char *)a3, a4, a5); + /* open to host libraries should not hijack */ + if (ret < 0 && (strncmp((char *)a3, "/lib", 4) == 0)) + ret = syscall(a1, a2, a3, a4, a5, a6, a7); + } + break; + case __NR_newfstatat: + if (!lkl_running) + ret = syscall(a1, a2, a3, a4, a5, a6, a7); + else + ret = lkl_sys_newfstatat(a2, (char *)a3, (void *)a4, a5); + break; + case __NR_epoll_create1: + return hijack_epoll_create1(a2); + case __NR_epoll_ctl: + return hijack_epoll_ctl(a2, a3, a4, (void *)a5); + case __NR_epoll_wait: + return hijack_epoll_wait(a2, (void *)a3, a4, a5); + case __NR_poll: + return hijack_poll((void *)a2, a3, a4); + case 
__NR_select: + return hijack_select(a2, (void *)a3, (void *)a4, (void *)a5, (void *)a6); + case __NR_eventfd2: + return hijack_eventfd(a2, a3); + case __NR_futex: + ret = syscall(a1, a2, a3, a4, a5, a6, a7); + if (ret < 0) + return -errno; + break; + default: + return syscall(a1, a2, a3, a4, a5, a6, a7); + } + + if (ret == LKL_ENOSYS) + fprintf(stderr, "no syscall defined in LKL (syscall=%ld)\n", a1); + + return ret; +} + +void __attribute__((destructor)) +hook_exit(void) +{ + __hijack_fini(); +} + +typedef long (*syscall_fn_t)(long, long, long, long, long, long, long); +int __hook_init(long placeholder __attribute__ ((__unused__)), + void *default_hook) +{ + *((syscall_fn_t *) default_hook) = zpoline_lkl_hook; + + /** + * XXX: this library is expected to be load via dlmopen of zpoline, thus + * we need to patch a workaorund to handle thread specific data. + */ + lkl_change_tls_mode(); + + __hijack_init(); + return 0; +} +#endif /* __x86_64__ */ diff --git a/tools/lkl/tests/hijack-test.sh b/tools/lkl/tests/hijack-test.sh index 164cf53977f0e7..597b6a834bed3e 100755 --- a/tools/lkl/tests/hijack-test.sh +++ b/tools/lkl/tests/hijack-test.sh @@ -686,10 +686,21 @@ if [[ ! 
-e ${basedir}/lib/hijack/liblkl-hijack.so ]]; then exit 0 fi +if [ -n "${LKL_HIJACK_ZPOLINE}" ] +then + if [ -z "$LKL_HOST_CONFIG_ZPOLINE_DIR" ]; then + lkl_test_plan 0 "zpoline tests" + echo "missing zpoline configuration" + exit $TEST_SKIP + fi + test_header=" (zpoline)" +fi + + if [ -n "$LKL_HOST_CONFIG_ANDROID" ]; then wdir=$ANDROID_WDIR adb_push lib/hijack/liblkl-hijack.so bin/lkl-hijack.sh tests/net-setup.sh \ - tests/run_netperf.sh tests/hijack-test.sh + tests/run_netperf.sh tests/hijack-test.sh tests/autoconf.sh ping="ping" ping6="ping6" hijack="$wdir/bin/lkl-hijack.sh" @@ -713,7 +724,7 @@ VDESWITCH=${wdir}/vde_switch # And make sure we clean up when we're done trap "clear_wdir &>/dev/null" EXIT -lkl_test_plan 5 "hijack basic tests" +lkl_test_plan 5 "hijack basic tests${test_header}" lkl_test_run 1 run_hijack ip addr lkl_test_run 2 run_hijack ip route lkl_test_run 3 test_ping diff --git a/tools/lkl/tests/run.py b/tools/lkl/tests/run.py index 5de0e1f5088b63..d013362cf97648 100755 --- a/tools/lkl/tests/run.py +++ b/tools/lkl/tests/run.py @@ -64,6 +64,7 @@ def end(self, obj): 'lklfuse.sh -t xfs', 'config', 'hijack-test.sh', + 'LKL_HIJACK_ZPOLINE=1 hijack-test.sh', 'disk-vfio-pci.sh -t ext4 run', 'disk-vfio-pci.sh -t btrfs run', 'disk-vfio-pci.sh -t vfat run',