-rw-r--r--  README.md | 3
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/.clang-format | 21
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/.gitignore | 5
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/.gitmodules | 6
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/README.md | 15
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/README.md | 10
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench-shell.nix | 51
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_main.sh | 31
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_native.nix | 32
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_native_base.sh | 107
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_vm.nix | 32
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_vm.sh | 41
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/commands.sh | 3
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/config.sh | 3
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/profile.cu | 15
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_diff_prio_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_0_start | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_100_tokens | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_20_minutes | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_20_tokens | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_30_minutes | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_30_tokens | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_40_minutes | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_40_tokens | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_50_minutes | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_50_tokens | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_long | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_zero_sleep | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/2_client_diff_prio_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/2_client_server_50_minutes | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_diff_prio_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_server_long | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_server_zero_sleep | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_server_zero_sleep_1_thread | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_diff_prio_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_server_long | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_server_zero_sleep | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_server_zero_sleep_1_thread | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_1_client_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_1_client_server_long | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_1_client_server_zero_sleep | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_1_client_server_zero_sleep_1_thread | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_2_client_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_2_client_server_long | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_2_client_server_zero_sleep | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_2_client_server_zero_sleep_1_thread | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_32_client_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_64_client_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_diff_prio_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_100_tokens | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_20_tokens | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_30_tokens | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_40_tokens | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_50_tokens | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_1_client_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_2_client_diff_prio_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_2_client_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_4_client_diff_prio_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_4_client_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_8_client_diff_prio_server | 9
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_8_client_server | 9
-rwxr-xr-x  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/throughput/throughput_single_bench.sh | 128
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/throughput/vm_scripts/server.nix | 74
-rwxr-xr-x  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/throughput/vm_scripts/throughput_single_vm.sh | 3
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_client/client.nix | 73
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/client.sh | 14
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/client_nix.sh | 579
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/server.sh | 26
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/server_nix.sh | 168
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/vm-client-shell.nix | 32
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/vm-server-shell.nix | 32
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_server/server.nix | 82
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/cuda-shell.nix | 32
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/flake.nix | 78
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/Makefile | 50
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/README.md | 92
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/client/write.c | 210
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/hypervisor/hypervisor.cpp | 134
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/include/args_client.h | 84
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/include/args_server.h | 106
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/include/misc.h | 5
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/include/shm.h | 40
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/client_request.h | 305
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/defines.h | 23
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/llama_bench_args.h | 796
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/llama_forwarder.c | 171
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/llama_server.cpp | 647
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/shm.cpp | 81
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/nvidia.nix | 67
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/pyplots/16-clients-long.py | 27
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/pyplots/context_switch.py | 32
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/pyplots/context_switch_combi.py | 37
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/pyplots/context_switch_ctx.py | 32
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/pyplots/host_effect.py | 71
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/pyplots/host_effect_seperat.py | 94
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/pyplots/implementation_overhead.py | 41
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/pyplots/llama_bench_overhead.py | 47
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/pyplots/mock_app_scalability.py | 33
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/pyplots/mock_impact.py | 25
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/pyplots/mock_overhead.py | 40
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/pyplots/scaling.py | 39
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/pyplots/scaling_bar.py | 48
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/pyplots/scheduler_n_tokens.py | 44
-rw-r--r--  archive/2025/summer/msc_berkay_eren_ueruen/pyplots/server_client_overhead.py | 28
-rwxr-xr-x  archive/2025/summer/msc_berkay_eren_ueruen/qemu_scripts/bind.sh | 8
-rwxr-xr-x  archive/2025/summer/msc_berkay_eren_ueruen/qemu_scripts/start_qemu.sh | 1
-rwxr-xr-x  archive/2025/summer/msc_berkay_eren_ueruen/qemu_scripts/start_qemu_no_gpu.sh | 1
-rwxr-xr-x  archive/2025/summer/msc_berkay_eren_ueruen/qemu_scripts/unbind.sh | 8
112 files changed, 5612 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 535676d0d..8bab33daf 100644
--- a/README.md
+++ b/README.md
@@ -9,8 +9,9 @@ website](https://dse.in.tum.de/thesis/).
 
 
 Archives:
-- [2024](./archive/2024/README.md)
 - [2025](./archive/2025/README.md)
+- [2024](./archive/2024/README.md)
+
 
 ## Uploading your artifacts to this repository
 
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/.clang-format b/archive/2025/summer/msc_berkay_eren_ueruen/.clang-format
new file mode 100644
index 000000000..739e9d857
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/.clang-format
@@ -0,0 +1,21 @@
+Language: Cpp
+BasedOnStyle: Google
+IndentWidth: 4
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: true
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: None
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: true
+ColumnLimit: 100
\ No newline at end of file
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/.gitignore b/archive/2025/summer/msc_berkay_eren_ueruen/.gitignore
new file mode 100644
index 000000000..196b82b18
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/.gitignore
@@ -0,0 +1,5 @@
+*.pdf
+*.svg
+*.png
+*.qcow2
+pyplots/venv/
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/.gitmodules b/archive/2025/summer/msc_berkay_eren_ueruen/.gitmodules
new file mode 100644
index 000000000..44885bb62
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "tensorflow-llm-os"]
+	path = tensorflow-llm-os
+	url = git@github.com:TUM-DSE/tensorflow-llm-os.git
+[submodule "llama.cpp"]
+	path = llama.cpp
+	url = git@github.com:TUM-DSE/llama.cpp.git
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/README.md b/archive/2025/summer/msc_berkay_eren_ueruen/README.md
new file mode 100644
index 000000000..b7df2bd8c
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/README.md
@@ -0,0 +1,15 @@
+# llm-os
+Trustworthy on-device LLM inference framework.
+
+### Benchmarks
+We use llama-bench from llama.cpp for benchmarking. The main command looks like this:
+`./llama-bench -m models/meta-llama-3.1-8b-instruct.f16.gguf -p 0 -n 2048,4096 -ngl 999 -o json --progress > output.json`
+
+```
+-m           model to use
+-p           prompt-processing benchmark; we disable it with 0 because we are only interested in text generation
+-n           number of tokens to generate; each value is run as a separate experiment
+-ngl         number of layers to offload to the GPU; a high value ensures all layers are loaded onto the GPU
+-o           set the output format to JSON
+--progress   show the benchmark's progress during execution
+```
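For a quick sanity check, it can help to run a smaller generation-only sweep first and pretty-print the resulting JSON. The sketch below simply reuses the flags documented above; the model path and token counts are example values:

```
# Shorter sweep with the same flags, then pretty-print the JSON report
./llama-bench -m models/meta-llama-3.1-8b-instruct.f16.gguf -p 0 -n 256,512 -ngl 999 -o json --progress > output.json
python3 -m json.tool output.json
```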
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/README.md b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/README.md
new file mode 100644
index 000000000..25114edd8
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/README.md
@@ -0,0 +1,10 @@
+The complete benchmark suite can be started by calling the script `bench_main.sh`. If no model file is specified, the script will ask for one. Any model that llama.cpp supports can be used.
+
+During its execution, `bench_main.sh` first runs the native tests by calling `bench_native.sh`. Afterwards, the tests that require VMs are run via `bench_vm.sh`. The latter spawns two VMs using QEMU, one with a GPU and one without. The resulting files are written to the `results` directory in the root of the `benchmarks` folder.
+
+It is worth noting that both `bench_native.sh` and `bench_vm.sh` can also be called separately.
+
+Details of each test can be found either directly in the scripts or in the scripts they call. For the native tests, the exact information is in `bench_native.sh`. The details for the VM tests are in the `vm_scripts` directory.
+
+### Issues
+The Phoronix Test Suite has some quirks in how it saves test results. In some cases, the result written to the `results` folder may not be the latest one. All results produced so far are, however, available inside the VM, so retrieving the Phoronix results from the VM itself is more reliable. This can be done by launching the VM from the `vm_client` folder with `nixos-shell client.nix`.
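A minimal end-to-end invocation might look as follows. This is only a sketch: it assumes a GGUF model has been copied to `benchmarks/models/model_file`, which is the path `bench_main.sh` checks for:

```
cd benchmarks
mkdir -p models
cp /path/to/your-model.gguf models/model_file   # any model supported by llama.cpp
bash bench_main.sh                              # native tests first, then the VM tests
ls results/                                     # per-test result files end up here
```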
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench-shell.nix b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench-shell.nix
new file mode 100644
index 000000000..5e866b998
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench-shell.nix
@@ -0,0 +1,51 @@
+#{ pkgs ? import /ssd_extension/teofil/nixpkgs {} }:
+#{ pkgs ? import <nixpkgs> {cudaSupport = true;} }:
+#{pkgs ? import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/8b27c1239e5c421a2bbc2c65d52e4a6fbf2ff296.tar.gz") {} }:   
+{pkgs ? import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/63dacb46bf939521bdc93981b4cbb7ecb58427a0.tar.gz") {} }:
+
+(pkgs.buildFHSUserEnv {
+ name = "cudazone";
+ targetPkgs = pkgs: (with pkgs; [
+		 gcc12
+		 gdb
+		 ccache
+#		 cudaPackages.nsight_systems
+#		 cudaPackages.cuda_nvcc
+		 cudatoolkit
+		 cudaPackages.cuda_cudart.stubs
+		 git-lfs
+		 glibc.dev
+		 gnumake
+		 cmake
+		 man
+		 go
+		 unzip
+		 curl.dev
+		 phoronix-test-suite
+		 python3
+		 php
+		 nixos-shell
+		 fzf
+		# Bench
+		rustc
+		perl
+		pcre.dev
+		libxml2
+		bc
+		bison
+		flex
+		openssl
+		yasm
+		nasm
+		libuuid
+		bzip2.dev
+		scons
+		ncurses
+		glew.dev
+		protobuf
+		boost
+		gnupatch
+ ]);
+ runScript = "bash";
+ }).env
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_main.sh b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_main.sh
new file mode 100644
index 000000000..33de4e4e9
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_main.sh
@@ -0,0 +1,31 @@
+source config.sh
+
+#echo $(taskset -pc 0-"$N_CPUS" $$)
+
+# save the last result
+mkdir -p ./results/backup
+mv ./results/* ./results/backup
+
+cd ..
+git submodule update --init --recursive
+cd -
+
+# check if a model exists
+if [ ! -f ./models/model_file ]; then
+    echo "Model file not found! Please put a model in the 'models' directory with the name 'model_file'"
+    exit
+fi
+
+# bind the GPU
+sudo bash ../qemu_scripts/bind.sh;
+
+# test the base with phoronix:
+# native openssl bench WITHOUT server and clients
+nix-shell bench_native.nix
+
+# unbind the GPU
+sudo bash ../qemu_scripts/unbind.sh;
+
+# test server VM and client VM with phoronix
+# phoronix is on client side
+nix-shell ./bench_vm.nix
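As the script shows, the native and VM stages are just two nix-shell invocations, so they can also be launched on their own. A sketch, reusing the same GPU bind/unbind steps as above:

```
sudo bash ../qemu_scripts/bind.sh      # GPU preparation, as bench_main.sh does before the native stage
nix-shell bench_native.nix             # native benchmarks (runs bench_native_base.sh)
sudo bash ../qemu_scripts/unbind.sh    # GPU handover, as bench_main.sh does before the VM stage
nix-shell ./bench_vm.nix               # VM benchmarks (runs bench_vm.sh)
```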
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_native.nix b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_native.nix
new file mode 100644
index 000000000..769799d02
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_native.nix
@@ -0,0 +1,32 @@
+#{ pkgs ? import /ssd_extension/teofil/nixpkgs {} }:
+#{ pkgs ? import <nixpkgs> {cudaSupport = true;} }:
+#{pkgs ? import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/8b27c1239e5c421a2bbc2c65d52e4a6fbf2ff296.tar.gz") {} }:   
+{pkgs ? import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/63dacb46bf939521bdc93981b4cbb7ecb58427a0.tar.gz") {} }:   
+
+(pkgs.buildFHSUserEnv {
+ name = "cudazone";
+ targetPkgs = pkgs: (with pkgs; [
+		 gcc12
+		 gdb
+		 ccache
+#		 cudaPackages.nsight_systems
+#		 cudaPackages.cuda_nvcc
+		 cudatoolkit
+		 cudaPackages.cuda_cudart.stubs
+		 git-lfs
+		 glibc.dev
+		 gnumake
+		 cmake
+		 man
+		 go
+		 unzip
+		 curl.dev
+		 phoronix-test-suite
+		 python3
+		 php
+		 nixos-shell
+		 fzf
+ ]);
+runScript = "bash ./bench_native_base.sh";
+ }).env
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_native_base.sh b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_native_base.sh
new file mode 100644
index 000000000..79609f4bf
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_native_base.sh
@@ -0,0 +1,107 @@
+set -x
+
+source ./config.sh
+
+SHARED_MEM_LOCATION="/dev/shm/shmBench"
+
+wait_for_server () {
+  until [ -f "../benchmarks/server_signals/test$1" ]
+  do
+     sleep 3
+  done
+}
+
+N_THREADS=16  # override the value sourced from config.sh for the native runs
+echo "$N_THREADS"
+
+# TEST 1.1: phoronix native
+export TEST_RESULTS_NAME=openssl-native-no-clients
+export OUTPUT_DIR=$(pwd)/results
+
+printf 'y\nn\nn\nn\nn\nn\nn\n' | phoronix-test-suite batch-setup
+phoronix-test-suite batch-install pts/openssl
+
+# start the benchmark
+# 2 defines the encryption
+printf '2\n' | phoronix-test-suite batch-run pts/openssl
+
+# save the result to shared directory
+phoronix-test-suite result-file-to-csv $TEST_RESULTS_NAME
+
+# TEST 1.2.1 server (GPU) <-native-> client
+# compile llama.cpp
+export GGML_CUDA=1
+cd ../llama.cpp
+make clean
+make libllama.so
+make libcommon.a
+make libggml.so
+
+# compile server/client
+cd ../intervm_comm
+make clean
+make
+
+# get the shared memory ready
+rm $SHARED_MEM_LOCATION
+touch $SHARED_MEM_LOCATION
+
+# clean signals
+mkdir -p ../benchmarks/server_signals
+rm ../benchmarks/server_signals/*
+
+./server -m ../benchmarks/models/model_file -p ../benchmarks/results/native_gpu_1_client_server -f ../benchmarks/server_signals/test1_2_1 --shared-mem=$SHARED_MEM_LOCATION --n_threads=$N_THREADS &
+wait_for_server "1_2_1"
+./client -i 1 -p 1 --sleep 0 --shared-mem=$SHARED_MEM_LOCATION &
+CLIENT_PID=$!
+sleep 300
+
+# kill the server and clients
+./client -w --shared-mem=$SHARED_MEM_LOCATION
+kill $CLIENT_PID
+
+# TEST 1.2.2 server (CPU) <-native-> client
+# compile llama.cpp
+unset GGML_CUDA
+cd ../llama.cpp
+make clean
+make libllama.so -j8
+make libcommon.a -j8
+make libggml.so -j8
+
+# compile server/client
+cd ../intervm_comm
+make clean
+make
+
+# get the shared memory ready
+rm $SHARED_MEM_LOCATION
+touch $SHARED_MEM_LOCATION
+
+# clean signals
+mkdir -p ../benchmarks/server_signals
+rm ../benchmarks/server_signals/*
+
+./server -m ../benchmarks/models/model_file -p ../benchmarks/results/native_cpu_1_client_server -f ../benchmarks/server_signals/test1_2_2 --shared-mem=$SHARED_MEM_LOCATION --n_threads=$N_THREADS &
+wait_for_server "1_2_2"
+./client -i 1 -p 1 --sleep 0 --shared-mem=$SHARED_MEM_LOCATION &
+CLIENT_PID=$!
+sleep 300
+
+# kill the server and clients
+./client -w --shared-mem=$SHARED_MEM_LOCATION
+kill $CLIENT_PID
+
+# TEST 1.3: llama-bench cpu
+cd ../llama.cpp
+unset GGML_CUDA
+make clean
+make llama-bench -j8
+./llama-bench -m ../benchmarks/models/model_file -t $N_THREADS -p 0 -n 64,128,256,512,1024 -ngl 0 -o json --progress > ../benchmarks/results/llama_bench_Q4_K_M_cpu_native.json
+
+# TEST 1.4: llama-bench gpu
+export GGML_CUDA=1
+make clean
+make llama-bench -j8
+./llama-bench -m ../benchmarks/models/model_file -t $N_THREADS -p 0 -n 64,128,256,512,1024 -ngl 999 -o json --progress > ../benchmarks/results/llama_bench_Q4_K_M_gpu_native.json
+
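The server/client pair used in TEST 1.2.x can also be exercised by hand. The sketch below reuses the exact flags from the script; the output and signal-file paths are placeholders, and the reading of `-w` as a stop request is inferred from its use above:

```
touch /dev/shm/shmBench                                      # shared-memory file
./server -m ../benchmarks/models/model_file -p /tmp/result -f /tmp/ready \
         --shared-mem=/dev/shm/shmBench --n_threads=8 &
until [ -f /tmp/ready ]; do sleep 3; done                    # same readiness check as wait_for_server
./client -i 1 -p 1 --sleep 0 --shared-mem=/dev/shm/shmBench &
CLIENT_PID=$!
sleep 60
./client -w --shared-mem=/dev/shm/shmBench                   # tells the server to stop, as in the script above
kill $CLIENT_PID
```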
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_vm.nix b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_vm.nix
new file mode 100644
index 000000000..1f6a44bbf
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_vm.nix
@@ -0,0 +1,32 @@
+#{ pkgs ? import /ssd_extension/teofil/nixpkgs {} }:
+#{ pkgs ? import <nixpkgs> {cudaSupport = true;} }:
+#{pkgs ? import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/8b27c1239e5c421a2bbc2c65d52e4a6fbf2ff296.tar.gz") {} }:   
+{pkgs ? import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/63dacb46bf939521bdc93981b4cbb7ecb58427a0.tar.gz") {} }:   
+
+(pkgs.buildFHSUserEnv {
+ name = "cudazone";
+ targetPkgs = pkgs: (with pkgs; [
+		 gcc12
+		 gdb
+		 ccache
+#		 cudaPackages.nsight_systems
+#		 cudaPackages.cuda_nvcc
+		 cudatoolkit
+		 cudaPackages.cuda_cudart.stubs
+		 git-lfs
+		 glibc.dev
+		 gnumake
+		 cmake
+		 man
+		 go
+		 unzip
+		 curl.dev
+		 phoronix-test-suite
+		 python3
+		 php
+		 nixos-shell
+		 fzf
+ ]);
+runScript = "bash ./bench_vm.sh";
+ }).env
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_vm.sh b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_vm.sh
new file mode 100644
index 000000000..af992dec1
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/bench_vm.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+SERVER_PORT=2345
+CLIENT_PORT=2346
+
+# unbind the GPU from the host
+sudo ../qemu_scripts/unbind.sh;
+
+# create directories
+mkdir -p results # this is redundant but nice to have here in case this script is called standalone
+mkdir -p server_signals
+
+# start VMs
+cd vm_server
+nixos-shell server.nix &
+
+# Connect to LLMOS and start the server
+while ! ssh -o StrictHostKeyChecking=no -p $SERVER_PORT root@localhost 'echo "Server VM live"'
+do
+    sleep 3
+done
+
+while ! ssh -o StrictHostKeyChecking=no -p $SERVER_PORT root@localhost 'bash /vm_scripts/server.sh' > server_output
+do
+    sleep 3
+done &
+
+cd ../vm_client
+nixos-shell client.nix &
+
+# Start the client
+while ! ssh -o StrictHostKeyChecking=no -p $CLIENT_PORT root@localhost 'echo "Client VM live"'
+do
+    sleep 3
+done
+
+while ! ssh -o StrictHostKeyChecking=no -p $CLIENT_PORT root@localhost 'bash /vm_scripts/client.sh' > client_output
+do
+    sleep 3
+done &
+
+echo "SUCCESS!"
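Since the VMs stay reachable over the forwarded SSH ports defined at the top of the script, results can also be pulled out manually once a run finishes; the Phoronix note in `benchmarks/README.md` applies here. A sketch:

```
ssh -o StrictHostKeyChecking=no -p 2346 root@localhost   # client VM (CLIENT_PORT above)
ssh -o StrictHostKeyChecking=no -p 2345 root@localhost   # server VM (SERVER_PORT above)
```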
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/commands.sh b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/commands.sh
new file mode 100644
index 000000000..b153d1971
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/commands.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+../llama.cpp/llama-bench -m ../models/Meta-Llama-3-8B-Instruct-GGUF/Meta-Llama-3-8B-Instruct.Q2_K.gguf -p 0 -n 64,128,256,512,1024 -o json -b 256,512,1024  -ngl 40,80,160 -ctk f16 -ctv f16  -nkvo 0,1 -fa 0,1 --progress > llama_Q2_f16kv.json
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/config.sh b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/config.sh
new file mode 100644
index 000000000..4b195e117
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/config.sh
@@ -0,0 +1,3 @@
+N_THREADS=8
+N_CPUS=8
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/profile.cu b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/profile.cu
new file mode 100644
index 000000000..efc6f5de0
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/profile.cu
@@ -0,0 +1,15 @@
+int main()
+{
+    const unsigned int N = 1048576;
+    const unsigned int bytes = N * sizeof(int);
+    int *h_a = (int*)malloc(bytes);
+    int *d_a;
+    cudaMalloc((int**)&d_a, bytes);
+
+    memset(h_a, 0, bytes);
+    cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
+    cudaMemcpy(h_a, d_a, bytes, cudaMemcpyDeviceToHost);
+
+    return 0;
+}
+
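One way to build and profile this snippet from inside the CUDA shells defined above; the choice of Nsight Systems is an assumption, since `cudaPackages.nsight_systems` is commented out in the shell definitions:

```
nvcc profile.cu -o profile
nsys profile ./profile        # or nvprof ./profile on older toolkits
```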
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_diff_prio_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_diff_prio_server
new file mode 100644
index 000000000..9645317c4
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_diff_prio_server
@@ -0,0 +1,9 @@
+Tokens generated: 1564
+Time spent: 618.193
+Throughput (t/s): 2.52995
+Time spent on context creation: 735ms
+Number of context creations: 7
+Average context creation time: 105ms
+Time spent on context switches: 0ms
+Number of context switches: 0
+Average context switch time: 0ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server
new file mode 100644
index 000000000..4ec0354c3
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server
@@ -0,0 +1,9 @@
+Tokens generated: 641
+Time spent: 622.791
+Throughput (t/s): 1.02924
+Time spent on context creation: 1602ms
+Number of context creations: 16
+Average context creation time: 100ms
+Time spent on context switches: 4719ms
+Number of context switches: 48
+Average context switch time: 98ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_0_start b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_0_start
new file mode 100644
index 000000000..4ec0354c3
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_0_start
@@ -0,0 +1,9 @@
+Tokens generated: 641
+Time spent: 622.791
+Throughput (t/s): 1.02924
+Time spent on context creation: 1602ms
+Number of context creations: 16
+Average context creation time: 100ms
+Time spent on context switches: 4719ms
+Number of context switches: 48
+Average context switch time: 98ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_100_tokens b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_100_tokens
new file mode 100644
index 000000000..659ce33f0
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_100_tokens
@@ -0,0 +1,9 @@
+Tokens generated: 1534
+Time spent: 618.118
+Throughput (t/s): 2.48173
+Time spent on context creation: 1475ms
+Number of context creations: 15
+Average context creation time: 98ms
+Time spent on context switches: 0ms
+Number of context switches: 0
+Average context switch time: 0ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_20_minutes b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_20_minutes
new file mode 100644
index 000000000..538bc29c2
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_20_minutes
@@ -0,0 +1,9 @@
+Tokens generated: 960
+Time spent: 1208.99
+Throughput (t/s): 0.79405
+Time spent on context creation: 1723ms
+Number of context creations: 16
+Average context creation time: 107ms
+Time spent on context switches: 8236ms
+Number of context switches: 80
+Average context switch time: 102ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_20_tokens b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_20_tokens
new file mode 100644
index 000000000..b44d5c47b
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_20_tokens
@@ -0,0 +1,9 @@
+Tokens generated: 841
+Time spent: 623.756
+Throughput (t/s): 1.34828
+Time spent on context creation: 1572ms
+Number of context creations: 16
+Average context creation time: 98ms
+Time spent on context switches: 2541ms
+Number of context switches: 26
+Average context switch time: 97ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_30_minutes b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_30_minutes
new file mode 100644
index 000000000..e903dad44
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_30_minutes
@@ -0,0 +1,9 @@
+Tokens generated: 1211
+Time spent: 1824.72
+Throughput (t/s): 0.663664
+Time spent on context creation: 1725ms
+Number of context creations: 16
+Average context creation time: 107ms
+Time spent on context switches: 13120ms
+Number of context switches: 105
+Average context switch time: 124ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_30_tokens b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_30_tokens
new file mode 100644
index 000000000..2606f8d3e
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_30_tokens
@@ -0,0 +1,9 @@
+Tokens generated: 1012
+Time spent: 616.917
+Throughput (t/s): 1.64041
+Time spent on context creation: 1578ms
+Number of context creations: 16
+Average context creation time: 98ms
+Time spent on context switches: 1660ms
+Number of context switches: 17
+Average context switch time: 97ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_40_minutes b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_40_minutes
new file mode 100644
index 000000000..ddda1ea56
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_40_minutes
@@ -0,0 +1,9 @@
+Tokens generated: 1391
+Time spent: 2423.65
+Throughput (t/s): 0.573927
+Time spent on context creation: 1812ms
+Number of context creations: 16
+Average context creation time: 113ms
+Time spent on context switches: 15523ms
+Number of context switches: 123
+Average context switch time: 126ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_40_tokens b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_40_tokens
new file mode 100644
index 000000000..847eaecfb
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_40_tokens
@@ -0,0 +1,9 @@
+Tokens generated: 1121
+Time spent: 622.501
+Throughput (t/s): 1.8008
+Time spent on context creation: 1581ms
+Number of context creations: 16
+Average context creation time: 98ms
+Time spent on context switches: 1166ms
+Number of context switches: 12
+Average context switch time: 97ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_50_minutes b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_50_minutes
new file mode 100644
index 000000000..cb6f9907d
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_50_minutes
@@ -0,0 +1,9 @@
+Tokens generated: 1591
+Time spent: 3029.49
+Throughput (t/s): 0.525171
+Time spent on context creation: 1617ms
+Number of context creations: 16
+Average context creation time: 101ms
+Time spent on context switches: 14274ms
+Number of context switches: 143
+Average context switch time: 99ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_50_tokens b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_50_tokens
new file mode 100644
index 000000000..95458738e
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_50_tokens
@@ -0,0 +1,9 @@
+Tokens generated: 1151
+Time spent: 618.558
+Throughput (t/s): 1.86078
+Time spent on context creation: 1567ms
+Number of context creations: 16
+Average context creation time: 97ms
+Time spent on context switches: 677ms
+Number of context switches: 7
+Average context switch time: 96ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_long b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_long
new file mode 100644
index 000000000..e18981ca4
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_long
@@ -0,0 +1,9 @@
+Tokens generated: 171
+Time spent: 648.466
+Throughput (t/s): 0.263699
+Time spent on context creation: 1604ms
+Number of context creations: 16
+Average context creation time: 100ms
+Time spent on context switches: 99ms
+Number of context switches: 1
+Average context switch time: 99ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_zero_sleep b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_zero_sleep
new file mode 100644
index 000000000..b1aeb142e
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/16_client_server_zero_sleep
@@ -0,0 +1,9 @@
+Tokens generated: 611
+Time spent: 617.51
+Throughput (t/s): 0.989458
+Time spent on context creation: 1718ms
+Number of context creations: 16
+Average context creation time: 107ms
+Time spent on context switches: 5396ms
+Number of context switches: 45
+Average context switch time: 119ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/2_client_diff_prio_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/2_client_diff_prio_server
new file mode 100644
index 000000000..e733a5616
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/2_client_diff_prio_server
@@ -0,0 +1,9 @@
+Tokens generated: 1824
+Time spent: 604.186
+Throughput (t/s): 3.01894
+Time spent on context creation: 673ms
+Number of context creations: 6
+Average context creation time: 112ms
+Time spent on context switches: 333ms
+Number of context switches: 3
+Average context switch time: 111ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/2_client_server_50_minutes b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/2_client_server_50_minutes
new file mode 100644
index 000000000..0d260dd31
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/2_client_server_50_minutes
@@ -0,0 +1,9 @@
+Tokens generated: 631
+Time spent: 3009.73
+Throughput (t/s): 0.209654
+Time spent on context creation: 235ms
+Number of context creations: 2
+Average context creation time: 117ms
+Time spent on context switches: 5964ms
+Number of context switches: 61
+Average context switch time: 97ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_diff_prio_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_diff_prio_server
new file mode 100644
index 000000000..d44f9ab68
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_diff_prio_server
@@ -0,0 +1,9 @@
+Tokens generated: 1839
+Time spent: 606.178
+Throughput (t/s): 3.03376
+Time spent on context creation: 896ms
+Number of context creations: 8
+Average context creation time: 112ms
+Time spent on context switches: 111ms
+Number of context switches: 1
+Average context switch time: 111ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_server
new file mode 100644
index 000000000..6d63e3c01
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_server
@@ -0,0 +1,9 @@
+Tokens generated: 381
+Time spent: 623.447
+Throughput (t/s): 0.611119
+Time spent on context creation: 392ms
+Number of context creations: 4
+Average context creation time: 98ms
+Time spent on context switches: 3746ms
+Number of context switches: 34
+Average context switch time: 110ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_server_long b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_server_long
new file mode 100644
index 000000000..73670c449
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_server_long
@@ -0,0 +1,9 @@
+Tokens generated: 171
+Time spent: 634.985
+Throughput (t/s): 0.269298
+Time spent on context creation: 422ms
+Number of context creations: 4
+Average context creation time: 105ms
+Time spent on context switches: 1288ms
+Number of context switches: 13
+Average context switch time: 99ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_server_zero_sleep b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_server_zero_sleep
new file mode 100644
index 000000000..de848796e
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_server_zero_sleep
@@ -0,0 +1,9 @@
+Tokens generated: 381
+Time spent: 623.173
+Throughput (t/s): 0.611387
+Time spent on context creation: 443ms
+Number of context creations: 4
+Average context creation time: 110ms
+Time spent on context switches: 3680ms
+Number of context switches: 34
+Average context switch time: 108ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_server_zero_sleep_1_thread b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_server_zero_sleep_1_thread
new file mode 100644
index 000000000..fc6b0481b
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/4_client_server_zero_sleep_1_thread
@@ -0,0 +1,9 @@
+Tokens generated: 76
+Time spent: 607.875
+Throughput (t/s): 0.125026
+Time spent on context creation: 422ms
+Number of context creations: 4
+Average context creation time: 105ms
+Time spent on context switches: 298ms
+Number of context switches: 3
+Average context switch time: 99ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_diff_prio_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_diff_prio_server
new file mode 100644
index 000000000..e2626e598
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_diff_prio_server
@@ -0,0 +1,9 @@
+Tokens generated: 1838
+Time spent: 610.586
+Throughput (t/s): 3.01022
+Time spent on context creation: 1026ms
+Number of context creations: 9
+Average context creation time: 114ms
+Time spent on context switches: 0ms
+Number of context switches: 0
+Average context switch time: 0ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_server
new file mode 100644
index 000000000..6270196d2
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_server
@@ -0,0 +1,9 @@
+Tokens generated: 511
+Time spent: 623.323
+Throughput (t/s): 0.8198
+Time spent on context creation: 780ms
+Number of context creations: 8
+Average context creation time: 97ms
+Time spent on context switches: 4158ms
+Number of context switches: 43
+Average context switch time: 96ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_server_long b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_server_long
new file mode 100644
index 000000000..d3d79547f
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_server_long
@@ -0,0 +1,9 @@
+Tokens generated: 181
+Time spent: 632.623
+Throughput (t/s): 0.28611
+Time spent on context creation: 808ms
+Number of context creations: 8
+Average context creation time: 101ms
+Time spent on context switches: 990ms
+Number of context switches: 10
+Average context switch time: 99ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_server_zero_sleep b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_server_zero_sleep
new file mode 100644
index 000000000..c58a2ac21
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_server_zero_sleep
@@ -0,0 +1,9 @@
+Tokens generated: 511
+Time spent: 623.056
+Throughput (t/s): 0.82015
+Time spent on context creation: 839ms
+Number of context creations: 8
+Average context creation time: 104ms
+Time spent on context switches: 4524ms
+Number of context switches: 43
+Average context switch time: 105ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_server_zero_sleep_1_thread b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_server_zero_sleep_1_thread
new file mode 100644
index 000000000..18f2a3f6e
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/8_client_server_zero_sleep_1_thread
@@ -0,0 +1,9 @@
+Tokens generated: 91
+Time spent: 665.463
+Throughput (t/s): 0.136747
+Time spent on context creation: 825ms
+Number of context creations: 8
+Average context creation time: 103ms
+Time spent on context switches: 102ms
+Number of context switches: 1
+Average context switch time: 102ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_1_client_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_1_client_server
new file mode 100644
index 000000000..1c9d20ba5
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_1_client_server
@@ -0,0 +1,9 @@
+Tokens generated: 1852
+Time spent: 601.666
+Throughput (t/s): 3.07812
+Time spent on context creation: 560ms
+Number of context creations: 5
+Average context creation time: 112ms
+Time spent on context switches: 0ms
+Number of context switches: 0
+Average context switch time: 0ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_1_client_server_long b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_1_client_server_long
new file mode 100644
index 000000000..07f81bd12
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_1_client_server_long
@@ -0,0 +1,9 @@
+Tokens generated: 1415
+Time spent: 591.995
+Throughput (t/s): 2.39022
+Time spent on context creation: 494ms
+Number of context creations: 4
+Average context creation time: 123ms
+Time spent on context switches: 0ms
+Number of context switches: 0
+Average context switch time: 0ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_1_client_server_zero_sleep b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_1_client_server_zero_sleep
new file mode 100644
index 000000000..a2c3148c2
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_1_client_server_zero_sleep
@@ -0,0 +1,9 @@
+Tokens generated: 1814
+Time spent: 593.899
+Throughput (t/s): 3.05439
+Time spent on context creation: 646ms
+Number of context creations: 5
+Average context creation time: 129ms
+Time spent on context switches: 0ms
+Number of context switches: 0
+Average context switch time: 0ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_1_client_server_zero_sleep_1_thread b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_1_client_server_zero_sleep_1_thread
new file mode 100644
index 000000000..b2a63b24f
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_1_client_server_zero_sleep_1_thread
@@ -0,0 +1,9 @@
+Tokens generated: 122
+Time spent: 599.439
+Throughput (t/s): 0.203524
+Time spent on context creation: 151ms
+Number of context creations: 1
+Average context creation time: 151ms
+Time spent on context switches: 0ms
+Number of context switches: 0
+Average context switch time: 0ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_2_client_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_2_client_server
new file mode 100644
index 000000000..35b9926fa
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_2_client_server
@@ -0,0 +1,9 @@
+Tokens generated: 271
+Time spent: 608.658
+Throughput (t/s): 0.445242
+Time spent on context creation: 227ms
+Number of context creations: 2
+Average context creation time: 113ms
+Time spent on context switches: 2794ms
+Number of context switches: 25
+Average context switch time: 111ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_2_client_server_long b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_2_client_server_long
new file mode 100644
index 000000000..219cd8413
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_2_client_server_long
@@ -0,0 +1,9 @@
+Tokens generated: 151
+Time spent: 637.763
+Throughput (t/s): 0.236765
+Time spent on context creation: 223ms
+Number of context creations: 2
+Average context creation time: 111ms
+Time spent on context switches: 1415ms
+Number of context switches: 13
+Average context switch time: 108ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_2_client_server_zero_sleep b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_2_client_server_zero_sleep
new file mode 100644
index 000000000..3896911a9
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_2_client_server_zero_sleep
@@ -0,0 +1,9 @@
+Tokens generated: 271
+Time spent: 609.287
+Throughput (t/s): 0.444782
+Time spent on context creation: 222ms
+Number of context creations: 2
+Average context creation time: 111ms
+Time spent on context switches: 2757ms
+Number of context switches: 25
+Average context switch time: 110ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_2_client_server_zero_sleep_1_thread b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_2_client_server_zero_sleep_1_thread
new file mode 100644
index 000000000..736cc2064
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_2_client_server_zero_sleep_1_thread
@@ -0,0 +1,9 @@
+Tokens generated: 61
+Time spent: 690.723
+Throughput (t/s): 0.0883132
+Time spent on context creation: 198ms
+Number of context creations: 2
+Average context creation time: 99ms
+Time spent on context switches: 400ms
+Number of context switches: 4
+Average context switch time: 100ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_32_client_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_32_client_server
new file mode 100644
index 000000000..119dc9e36
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_32_client_server
@@ -0,0 +1,9 @@
+Tokens generated: 455
+Time spent: 300.21
+Throughput (t/s): 1.5156
+Time spent on context creation: 3387ms
+Number of context creations: 32
+Average context creation time: 105ms
+Time spent on context switches: 1365ms
+Number of context switches: 13
+Average context switch time: 105ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_64_client_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_64_client_server
new file mode 100644
index 000000000..bc75b0411
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/cpu_64_client_server
@@ -0,0 +1,9 @@
+Tokens generated: 1047
+Time spent: 667.081
+Throughput (t/s): 1.56952
+Time spent on context creation: 6348ms
+Number of context creations: 64
+Average context creation time: 99ms
+Time spent on context switches: 3970ms
+Number of context switches: 40
+Average context switch time: 99ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_diff_prio_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_diff_prio_server
new file mode 100644
index 000000000..0823e05be
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_diff_prio_server
@@ -0,0 +1,9 @@
+Tokens generated: 45341
+Time spent: 616.844
+Throughput (t/s): 73.5048
+Time spent on context creation: 977ms
+Number of context creations: 104
+Average context creation time: 9ms
+Time spent on context switches: 11089ms
+Number of context switches: 1199
+Average context switch time: 9ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server
new file mode 100644
index 000000000..9c7004910
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server
@@ -0,0 +1,9 @@
+Tokens generated: 32368
+Time spent: 617.118
+Throughput (t/s): 52.4502
+Time spent on context creation: 773ms
+Number of context creations: 74
+Average context creation time: 10ms
+Time spent on context switches: 33671ms
+Number of context switches: 3184
+Average context switch time: 10ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_100_tokens b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_100_tokens
new file mode 100644
index 000000000..fe5e0a775
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_100_tokens
@@ -0,0 +1,9 @@
+Tokens generated: 48797
+Time spent: 616.934
+Throughput (t/s): 79.096
+Time spent on context creation: 1274ms
+Number of context creations: 116
+Average context creation time: 10ms
+Time spent on context switches: 4688ms
+Number of context switches: 432
+Average context switch time: 10ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_20_tokens b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_20_tokens
new file mode 100644
index 000000000..d0ef4c1e2
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_20_tokens
@@ -0,0 +1,9 @@
+Tokens generated: 40197
+Time spent: 636.68
+Throughput (t/s): 63.1353
+Time spent on context creation: 850ms
+Number of context creations: 93
+Average context creation time: 9ms
+Time spent on context switches: 18120ms
+Number of context switches: 1954
+Average context switch time: 9ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_30_tokens b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_30_tokens
new file mode 100644
index 000000000..17e0e8bd4
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_30_tokens
@@ -0,0 +1,9 @@
+Tokens generated: 43629
+Time spent: 617.077
+Throughput (t/s): 70.7027
+Time spent on context creation: 1039ms
+Number of context creations: 105
+Average context creation time: 9ms
+Time spent on context switches: 13995ms
+Number of context switches: 1394
+Average context switch time: 10ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_40_tokens b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_40_tokens
new file mode 100644
index 000000000..e79994cd5
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_40_tokens
@@ -0,0 +1,9 @@
+Tokens generated: 45517
+Time spent: 616.822
+Throughput (t/s): 73.7928
+Time spent on context creation: 1176ms
+Number of context creations: 107
+Average context creation time: 10ms
+Time spent on context switches: 11900ms
+Number of context switches: 1083
+Average context switch time: 10ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_50_tokens b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_50_tokens
new file mode 100644
index 000000000..9f0ff211d
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_16_client_server_50_tokens
@@ -0,0 +1,9 @@
+Tokens generated: 47157
+Time spent: 616.93
+Throughput (t/s): 76.4382
+Time spent on context creation: 1104ms
+Number of context creations: 119
+Average context creation time: 9ms
+Time spent on context switches: 8042ms
+Number of context switches: 868
+Average context switch time: 9ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_1_client_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_1_client_server
new file mode 100644
index 000000000..3b1d19664
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_1_client_server
@@ -0,0 +1,9 @@
+Tokens generated: 0
+Time spent: 103683
+Throughput (t/s): 0
+Time spent on context creation: 0ms
+Number of context creations: 0
+Average context creation time: 0ms
+Time spent on context switches: 0ms
+Number of context switches: 0
+Average context switch time: 0ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_2_client_diff_prio_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_2_client_diff_prio_server
new file mode 100644
index 000000000..59ee0a03f
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_2_client_diff_prio_server
@@ -0,0 +1,9 @@
+Tokens generated: 48650
+Time spent: 602.952
+Throughput (t/s): 80.6864
+Time spent on context creation: 1097ms
+Number of context creations: 109
+Average context creation time: 10ms
+Time spent on context switches: 394ms
+Number of context switches: 36
+Average context switch time: 10ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_2_client_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_2_client_server
new file mode 100644
index 000000000..bc691a4d3
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_2_client_server
@@ -0,0 +1,9 @@
+Tokens generated: 32445
+Time spent: 602.592
+Throughput (t/s): 53.8424
+Time spent on context creation: 806ms
+Number of context creations: 77
+Average context creation time: 10ms
+Time spent on context switches: 32304ms
+Number of context switches: 3112
+Average context switch time: 10ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_4_client_diff_prio_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_4_client_diff_prio_server
new file mode 100644
index 000000000..b2a616d0f
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_4_client_diff_prio_server
@@ -0,0 +1,9 @@
+Tokens generated: 42499
+Time spent: 601.264
+Throughput (t/s): 70.6828
+Time spent on context creation: 1127ms
+Number of context creations: 94
+Average context creation time: 11ms
+Time spent on context switches: 12329ms
+Number of context switches: 1046
+Average context switch time: 11ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_4_client_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_4_client_server
new file mode 100644
index 000000000..ed6e60327
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_4_client_server
@@ -0,0 +1,9 @@
+Tokens generated: 31620
+Time spent: 604.68
+Throughput (t/s): 52.2921
+Time spent on context creation: 631ms
+Number of context creations: 69
+Average context creation time: 9ms
+Time spent on context switches: 28181ms
+Number of context switches: 3089
+Average context switch time: 9ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_8_client_diff_prio_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_8_client_diff_prio_server
new file mode 100644
index 000000000..c3b9cbac9
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_8_client_diff_prio_server
@@ -0,0 +1,9 @@
+Tokens generated: 44101
+Time spent: 608.137
+Throughput (t/s): 72.5182
+Time spent on context creation: 1083ms
+Number of context creations: 100
+Average context creation time: 10ms
+Time spent on context switches: 12102ms
+Number of context switches: 1147
+Average context switch time: 10ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_8_client_server b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_8_client_server
new file mode 100644
index 000000000..49e58fde9
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/results/gpu_8_client_server
@@ -0,0 +1,9 @@
+Tokens generated: 32034
+Time spent: 608.872
+Throughput (t/s): 52.612
+Time spent on context creation: 762ms
+Number of context creations: 74
+Average context creation time: 10ms
+Time spent on context switches: 32721ms
+Number of context switches: 3157
+Average context switch time: 10ms
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/throughput/throughput_single_bench.sh b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/throughput/throughput_single_bench.sh
new file mode 100755
index 000000000..6f2c4a938
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/throughput/throughput_single_bench.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+
+# Function to display usage information
+# TODO: Fill in options
+function usage() {
+    echo "Usage: $0 -m model1,model2,... -n n_to_gen1,n_to_gen2, -p prompt_file"
+	echo "Prompt_file contains 1 prompt on every line"
+	echo "This script runs the throughput benchmark using a single request."
+    exit 1  
+}
+
+# Parse command line arguments
+# getopts is a bash built-in for processing command-line options
+# "m:p:n:" means we expect options -m, -p, and -n, each with a required argument (indicated by the colon)
+while getopts "m:p:n:" opt; do
+    case $opt in
+        m) IFS=',' read -ra MODELS <<< "$OPTARG"    # Split comma-separated -m argument into MODELS array
+           ;;
+        p) IFS=',' read -ra PROMPT_FILE <<< "$OPTARG"    
+           ;;
+        n) IFS=',' read -ra TOKENS_TO_GENERATE <<< "$OPTARG"  
+           ;;
+        *) usage ;;  # Call usage() if an unknown option is provided
+    esac
+done
+
+# Check if all required parameters are provided
+# ${#ARRAY[@]} gives the length of the array
+if [ ${#MODELS[@]} -eq 0 ] || [ ${#PROMPT_FILE[@]} -ne 1 ] || [ ${#TOKENS_TO_GENERATE[@]} -eq 0 ]; then
+    echo "Error: All parameters (-m, -p, -n) are required."
+    usage  # Show usage and exit if any parameter is missing
+fi
+
+# Trim whitespace from array elements
+# This removes leading and trailing spaces from each element
+for i in "${!MODELS[@]}"; do  # ${!ARRAY[@]} returns the indices of the array
+    MODELS[$i]=$(echo "${MODELS[$i]}" | xargs)  # xargs with no arguments trims whitespace
+done
+
+for i in "${!PROMPT_FILE[@]}"; do
+    PROMPT_FILE[$i]=$(echo "${PROMPT_FILE[$i]}" | xargs)
+done
+
+for i in "${!TOKENS_TO_GENERATE[@]}"; do
+    TOKENS_TO_GENERATE[$i]=$(echo "${TOKENS_TO_GENERATE[$i]}" | xargs)
+done
+
+
+# Prepare folders for results
+mkdir -p results
+native_server_file="results/native_server.txt"
+vm_server_file="results/vm_server.txt"
+llm_os_file="results/llm_os.txt"
+:> $native_server_file 
+:> $vm_server_file 
+:> $llm_os_file 
+
+nb_prompts=`cat ${PROMPT_FILE[0]} | wc -l`
+echo "${PROMPT_FILE[0]} contains ${nb_prompts} prompts."
+run=1
+total_size=`expr 3 \* ${#MODELS[@]} \* ${nb_prompts} \* ${#TOKENS_TO_GENERATE[@]}`
+echo "Running all combinations:"
+for model in "${MODELS[@]}"; do
+	for prompt_nb in $(seq 1 ${nb_prompts}); do
+        for n_to_gen in "${TOKENS_TO_GENERATE[@]}"; do
+			
+		#	echo "Prompt nb: ${prompt_nb}"
+			# extract the `prompt_nb`th line from the file
+			prompt=`sed -n "${prompt_nb}p" ${PROMPT_FILE[0]}`
+
+			# Run the native llama-server
+			################################################
+			command_native="../../llama.cpp/llama-server --metrics -m ${model}"
+			echo "[$run/$total_size] Executing: $command_native"
+			run=`expr $run + 1`
+			eval $command_native &> /dev/null &
+			sleep 10
+			eval "curl --request POST --url http://localhost:8080/completion --data '{\"prompt\": \"${prompt}\",\"n_predict\": ${n_to_gen}}'"  &> /dev/null
+			echo "--------- NATIVE --------- " >> $native_server_file
+			eval "curl --request GET --url http://localhost:8080/metrics" >> $native_server_file
+			echo "model=$model" >> $native_server_file
+			echo "-------------------------- " >> $native_server_file
+			pkill -f "llama-server"
+			#################################################
+
+			# Run llama-server in VM
+			#################################################
+			command_vm=""	
+			echo "[$run/$total_size] Executing in VM: $command_native"
+			run=`expr $run + 1`
+			
+			# start VM
+			cd vm_scripts
+			nixos-shell server.nix &
+			PID=$!
+
+			# Connect to the VM and start the server
+			while ! ssh -o StrictHostKeyChecking=no -p 2345 root@localhost 'echo "Server VM live"'
+			do
+				sleep 3
+			done
+
+			ssh_command="/vm_scripts/throughput_single_vm.sh"
+			while ! ssh -o StrictHostKeyChecking=no -p 2345 root@localhost \"$ssh_command\" >> ../$vm_server_file
+			do
+				sleep 1
+			done
+			# TODO: How do I find out when to kill VM?
+			sleep 20
+			pkill -f "qemu-kvm"
+			exit 1
+			#kill $PID
+			cd ..
+		
+			#################################################
+			
+			# Run LLM-OS
+			#################################################
+            command_llmos="./script_helper -m \"$model\" -p \"$prompt\" -n \"$n_to_gen\""
+			echo "[$run/$total_size] Executing: $command_llmos"
+			run=`expr $run + 1`
+           # eval $command  # Execute the constructed command using eval
+			#################################################
+        done
+    done
+done
+
+echo "All combinations completed."
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/throughput/vm_scripts/server.nix b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/throughput/vm_scripts/server.nix
new file mode 100644
index 000000000..6cf3ff43f
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/throughput/vm_scripts/server.nix
@@ -0,0 +1,74 @@
+{ pkgs, config, ... }:
+{
+  nixpkgs.config.allowUnfree = true;
+  nixpkgs.config.nvidia.acceptLicense = true;
+  # enable the nvidia driver
+  services.xserver.videoDrivers = [ "nvidia" ];
+  hardware.opengl.enable = true;
+  hardware.nvidia.datacenter.enable = true;
+  hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.dc_535;
+
+  hardware.nvidia.open = true;
+
+  virtualisation.docker.enable = true;
+  hardware.nvidia-container-toolkit.enable = true;
+  hardware.opengl.driSupport32Bit = true;
+
+  virtualisation.memorySize = 64 * 1024;
+  virtualisation.diskSize = 128 * 1024;
+  # Nix Store must be persistent across all QEMU
+  # executions
+  virtualisation.writableStoreUseTmpfs = false;
+
+  services.openssh.enable = true;
+  virtualisation.forwardPorts = [
+      { from = "host"; host.port = 2345; guest.port = 22; }
+    ];
+
+  virtualisation.qemu.options = [
+    #"-name NIXVM,debug-threads=on"
+    "-enable-kvm"
+    "-cpu host"
+#    "-device vfio-pci,host=ca:00.0"
+    "-device ivshmem-plain,memdev=shmBerkay,bus=pci.0,addr=0x12,master=on"
+    "-object memory-backend-file,size=32M,share=on,mem-path=/dev/shm/shmBerkay,id=shmBerkay"
+    "-smp 32,sockets=1,cores=32,threads=1,maxcpus=32"
+    "-m 64G"
+  ];
+
+  nixos-shell.mounts.extraMounts = {
+    # override options for each mount
+    "/results" = {
+      target = ../results;
+      cache = "none";
+    };
+    "/vm_scripts" = {
+      target = ./.;
+      cache = "none";
+    };
+    "/models" = {
+	  target=../../../../models;
+      cache = "none";
+    };
+  };
+  nixos-shell.mounts = {
+    mountHome = false;
+    mountNixProfile = false;
+    cache = "none"; # default is "loose"
+  };
+
+
+  environment.systemPackages = [
+  	config.boot.kernelPackages.nvidiaPackages.dc_535
+#	config.boot.kernelPackages.nvidia_x11
+	pkgs.git
+	pkgs.phoronix-test-suite
+	pkgs.neovim
+	pkgs.cudaPackages.cuda_cudart.stubs
+	pkgs.cudatoolkit
+	pkgs.pciutils
+	pkgs.openssh
+	];
+}
+
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/throughput/vm_scripts/throughput_single_vm.sh b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/throughput/vm_scripts/throughput_single_vm.sh
new file mode 100755
index 000000000..db987daff
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/throughput/vm_scripts/throughput_single_vm.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+echo "Script running in VM!"
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_client/client.nix b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_client/client.nix
new file mode 100644
index 000000000..7ea67ce85
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_client/client.nix
@@ -0,0 +1,73 @@
+{ pkgs, config, ... }:
+{
+  nixpkgs.config.allowUnfree = true;
+  nixpkgs.config.nvidia.acceptLicense = true;
+  # enable the nvidia driver
+  services.xserver.videoDrivers = [ "nvidia" ];
+  hardware.opengl.enable = true;
+  hardware.nvidia.datacenter.enable = true;
+  hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.dc_535;
+
+  hardware.nvidia.open = true;
+
+  virtualisation.docker.enable = true;
+  hardware.nvidia-container-toolkit.enable = true;
+  hardware.opengl.driSupport32Bit = true;
+
+  virtualisation.memorySize = 64 * 1024;
+  virtualisation.diskSize = 128 * 1024;
+  # Nix Store must be persistent across all QEMU
+  # executions
+  virtualisation.writableStoreUseTmpfs = false;
+  services.openssh.enable = true;
+  virtualisation.forwardPorts = [
+      { from = "host"; host.port = 2346; guest.port = 22; }
+    ];
+
+  virtualisation.qemu.options = [
+    #"-name NIXVM,debug-threads=on"
+    "-enable-kvm"
+    "-cpu host"
+    "-smp 16,sockets=1,cores=16,threads=1,maxcpus=16"
+    "-device ivshmem-plain,memdev=shmBerkay,bus=pci.0,addr=0x12,master=on"
+    "-object memory-backend-file,size=32M,share=on,mem-path=/dev/shm/shmBerkay,id=shmBerkay"
+    "-m 64G"
+  ];
+
+  nixos-shell.mounts.extraMounts = {
+    # override options for each mount
+    "/llm-os" = {
+      target = ../../.;
+      cache = "none";
+    };
+    "/results" = {
+      target = ../results;
+      cache = "none";
+    };
+    "/vm_scripts" = {
+      target = ../vm_scripts;
+      cache = "none";
+    };
+    "/server_signals" = {
+      target = ../server_signals;
+      cache = "none";
+    };
+  };
+  nixos-shell.mounts = {
+    mountHome = false;
+    mountNixProfile = false;
+    cache = "none"; # default is "loose"
+  };
+
+
+  environment.systemPackages = [
+	pkgs.git
+	pkgs.neovim
+	pkgs.fish
+	pkgs.stow
+	pkgs.tmux
+	pkgs.phoronix-test-suite
+	];
+}
+
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/client.sh b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/client.sh
new file mode 100644
index 000000000..41e2e8870
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/client.sh
@@ -0,0 +1,14 @@
+set -x
+
+cd ~
+
+mkdir -p llm-os
+rsync --progress --whole-file -a /llm-os ./ --exclude /llm-os/benchmarks/models --exclude /llm-os/loras --exclude /llm-os/benchmarks/vm_server --exclude /llm-os/benchmarks/vm_client --exclude /llm-os/.git
+cd llm-os
+
+# run the benchmark
+export NIXPKGS_ALLOW_UNFREE=1
+nix-shell /vm_scripts/vm-client-shell.nix
+
+shutdown now
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/client_nix.sh b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/client_nix.sh
new file mode 100644
index 000000000..ac1f41e6b
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/client_nix.sh
@@ -0,0 +1,579 @@
+set -x
+
+wait_for_server () {
+  until [ -f "/server_signals/test$1" ]
+  do
+     sleep 3
+  done
+}
+
+export FORCE_TIMES_TO_RUN=5 # Force Phoronix to repeat the same test a specific number of times
+TEST_LENGTH=600
+
+# compile client
+cd intervm_comm
+make clean
+make client
+
+# TEST 2.1 GPU: server <-> client (5 minutes)
+wait_for_server "2_1_gpu"
+./client -i 1 -p 1 -s 0 &
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.2 GPU: server <-> 2 * client
+wait_for_server "2_2_gpu"
+for i in $(seq 1 2); do
+    ./client -i "$i" -p 1 -s 0 &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.3: server <-> 4 * client
+wait_for_server "3"
+for i in $(seq 1 4); do
+    ./client -i "$i" -p 1 &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.4: server <-> 8 * client
+wait_for_server "4"
+for i in $(seq 1 8); do
+    ./client -i "$i" -p 1 &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.5: server <-> 16 * client
+wait_for_server "5"
+for i in $(seq 1 16); do
+    ./client -i "$i" -p 1 &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+# TEST 2.5.32: server <-> 32 * client
+wait_for_server "5_32"
+for i in $(seq 1 32); do
+    ./client -i "$i" -p 1 &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+# TEST 2.5_64: server <-> 64 * client
+wait_for_server "5_64"
+for i in $(seq 1 64); do
+    ./client -i "$i" -p 1 &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+# TEST 2.5.1: server <-> 16 * client, 50 n_token
+wait_for_server "5_1"
+for i in $(seq 1 16); do
+    ./client -i "$i" -p 1 &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.5.2: server <-> 16 * client, 100 n_token
+wait_for_server "5_2"
+for i in $(seq 1 16); do
+    ./client -i "$i" -p 1 &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.5.3: server <-> 16 * client, 20 n_token
+wait_for_server "5_3"
+for i in $(seq 1 16); do
+    ./client -i "$i" -p 1 &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.5.4: server <-> 16 * client, 30 n_token
+wait_for_server "5_4"
+for i in $(seq 1 16); do
+    ./client -i "$i" -p 1 &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.5.5: server <-> 16 * client, 40 n_token
+wait_for_server "5_5"
+for i in $(seq 1 16); do
+    ./client -i "$i" -p 1 &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.6: server <-> 2 * client
+wait_for_server "6"
+for i in $(seq 1 2); do
+    ./client -i "$i" -p "$i" &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.7: server <-> 4 * client
+wait_for_server "7"
+for i in $(seq 1 4); do
+    ./client -i "$i" -p "$i" &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.8: server <-> 8 * client
+wait_for_server "8"
+for i in $(seq 1 8); do
+    ./client -i "$i" -p "$i" &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.9: server <-> 16 * client
+wait_for_server "9"
+for i in $(seq 1 16); do
+    ./client -i "$i" -p "$i" &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+sleep 5
+
+# TEST 2.10: server <-> client + phoronix
+printf 'y\nn\nn\nn\nn\nn\nn\n' | phoronix-test-suite batch-setup
+phoronix-test-suite batch-install pts/openssl
+
+export TEST_RESULTS_NAME=gpu-phoronix-result-openssl-vm-1-client
+export OUTPUT_DIR=/results
+
+wait_for_server "2_10_gpu"
+./client -i 1 -p 1 -s 0 &
+
+# start the benchmark
+# 2 defines the encryption
+printf '2\n' | phoronix-test-suite batch-run pts/openssl
+
+# after the benchmark send stop request
+./client -w
+
+# kill the clients
+pkill client
+
+# save the result to shared directory
+phoronix-test-suite result-file-to-csv $TEST_RESULTS_NAME
+
+# TEST 2.11.1: server (throughput limit) <-> client + phoronix
+export TEST_RESULTS_NAME=gpu-phoronix-result-openssl-vm-1-client-throttled-10
+export OUTPUT_DIR=/results
+
+wait_for_server "2_11_1_gpu"
+./client -i 1 -p 1 -s 0 &
+
+# start the benchmark
+# 2 defines the encryption
+printf '2\n' | phoronix-test-suite batch-run pts/openssl
+
+# after the benchmark send stop request
+./client -w
+
+# kill the clients
+pkill client
+
+# save the result to shared directory
+phoronix-test-suite result-file-to-csv $TEST_RESULTS_NAME
+# TEST 2.11.2: server (throughput limit) <-> client + phoronix
+export TEST_RESULTS_NAME=gpu-phoronix-result-openssl-vm-1-client-throttled-20
+export OUTPUT_DIR=/results
+
+wait_for_server "2_11_2_gpu"
+./client -i 1 -p 1 -s 0 &
+
+# start the benchmark
+# 2 defines the encryption
+printf '2\n' | phoronix-test-suite batch-run pts/openssl
+
+# after the benchmark send stop request
+./client -w
+
+# kill the clients
+pkill client
+
+# save the result to shared directory
+phoronix-test-suite result-file-to-csv $TEST_RESULTS_NAME
+# TEST 2.11.3: server (throughput limit) <-> client + phoronix
+export TEST_RESULTS_NAME=gpu-phoronix-result-openssl-vm-1-client-throttled-30
+export OUTPUT_DIR=/results
+
+wait_for_server "2_11_3_gpu"
+./client -i 1 -p 1 -s 0 &
+
+# start the benchmark
+# 2 defines the encryption
+printf '2\n' | phoronix-test-suite batch-run pts/openssl
+
+# after the benchmark send stop request
+./client -w
+
+# kill the clients
+pkill client
+
+# save the result to shared directory
+phoronix-test-suite result-file-to-csv $TEST_RESULTS_NAME
+# TEST 2.11.4: server (throughput limit) <-> client + phoronix
+export TEST_RESULTS_NAME=gpu-phoronix-result-openssl-vm-1-client-throttled-40
+export OUTPUT_DIR=/results
+
+wait_for_server "2_11_4_gpu"
+./client -i 1 -p 1 -s 0 &
+
+# start the benchmark
+# 2 defines the encryption
+printf '2\n' | phoronix-test-suite batch-run pts/openssl
+
+# after the benchmark send stop request
+./client -w
+
+# kill the clients
+pkill client
+
+# save the result to shared directory
+phoronix-test-suite result-file-to-csv $TEST_RESULTS_NAME
+
+# TEST 2.12: none <-> phoronix
+export TEST_RESULTS_NAME=phoronix-result-vm-only
+export OUTPUT_DIR=/results
+
+# start the benchmark
+# 2 defines the encryption
+printf '2\n' | phoronix-test-suite batch-run pts/openssl
+
+# save the result to shared directory
+phoronix-test-suite result-file-to-csv $TEST_RESULTS_NAME
+touch /server_signals/test2_12_gpu_complete
+
+# TEST 2.13 GPU: llama-bench gpu
+# runs on server
+
+# TEST 2.1 CPU: server <-> client (5 minutes)
+wait_for_server "2_1_cpu"
+./client -i 1 -p 1 -s 0 &
+
+./client -w
+pkill client
+
+# TEST 2.2 CPU: server <-> 2 * client
+wait_for_server "2_2_cpu"
+parallel ./client -p 1 -s 0 -i ::: {1..2}
+
+./client -w
+pkill client
+
+# TEST 2.3 CPU: server <-> 4 * client
+wait_for_server "3"
+parallel ./client -p 1 -s 0 -i ::: {1..4}
+
+./client -w
+pkill client
+
+# TEST 2.4: server <-> 8 * client
+wait_for_server "4"
+parallel ./client -p 1 -s 0 -i ::: {1..8}
+
+./client -w
+pkill client
+
+# TEST 2.5: server <-> 16 * client
+wait_for_server "5"
+parallel ./client -p 1 -s 0 -i ::: {1..16}
+
+./client -w
+pkill client
+# TEST 2.5: server <-> 16 * client, 20 minutes
+wait_for_server "5_20m"
+for i in $(seq 1 16); do
+    ./client -i "$i" -p 1 > ~/client"$i" &
+    sleep 1
+done
+
+sleep 1200
+./client -w
+pkill client
+# TEST 2.5: server <-> 16 * client, 30 minutes
+wait_for_server "5_30m"
+for i in $(seq 1 16); do
+    ./client -i "$i" -p 1 &
+    sleep 1
+done
+
+sleep 1800
+./client -w
+pkill client
+# TEST 2.5: server <-> 16 * client, 40 minutes
+wait_for_server "5_40m"
+for i in $(seq 1 16); do
+    ./client -i "$i" -p 1 &
+    sleep 1
+done
+
+sleep 2400
+./client -w
+pkill client
+# TEST 2.5: server <-> 16 * client, 50 minutes
+wait_for_server "5_50m"
+for i in $(seq 1 16); do
+    ./client -i "$i" -p 1 &
+    sleep 1
+done
+
+sleep 3000
+./client -w
+pkill client
+# TEST 2.5.1: server <-> 16 * client, 50 n_token
+wait_for_server "5_1"
+for i in $(seq 1 16); do
+    ./client -i "$i" -p 1 &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.5.2: server <-> 16 * client, 100 n_token
+wait_for_server "5_2"
+for i in $(seq 1 16); do
+    ./client -i "$i" -p 1 &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.5.3: server <-> 16 * client, 20 n_token
+wait_for_server "5_3"
+for i in $(seq 1 16); do
+    ./client -i "$i" -p 1 &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.5.4: server <-> 16 * client, 30 n_token
+wait_for_server "5_4"
+for i in $(seq 1 16); do
+    ./client -i "$i" -p 1 &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.5.5: server <-> 16 * client, 40 n_token
+wait_for_server "5_5"
+for i in $(seq 1 16); do
+    ./client -i "$i" -p 1 &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.6: server <-> 2 * client
+wait_for_server "6"
+for i in $(seq 1 2); do
+    ./client -i "$i" -p "$i" &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.7: server <-> 4 * client
+wait_for_server "7"
+for i in $(seq 1 4); do
+    ./client -i "$i" -p "$i" &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.8: server <-> 8 * client
+wait_for_server "8"
+for i in $(seq 1 8); do
+    ./client -i "$i" -p "$i" &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+
+# TEST 2.9: server <-> 16 * client
+wait_for_server "9"
+for i in $(seq 1 16); do
+    ./client -i "$i" -p "$i" &
+    sleep 1
+done
+
+sleep $TEST_LENGTH
+./client -w
+pkill client
+sleep 5
+
+# TEST 2.10 CPU: server <-> client + phoronix
+printf 'y\nn\nn\nn\nn\nn\nn\n' | phoronix-test-suite batch-setup
+phoronix-test-suite batch-install pts/openssl
+
+export TEST_RESULTS_NAME=cpu-phoronix-result-openssl-vm-1-client
+export OUTPUT_DIR=/results
+
+wait_for_server "2_10_cpu"
+./client -i 1 -p 1 -s 0 &
+
+# start the benchmark
+# 2 defines the encryption
+printf '2\n' | phoronix-test-suite batch-run pts/openssl
+
+# after the benchmark send stop request
+./client -w
+
+# kill the clients
+pkill client
+
+# save the result to shared directory
+phoronix-test-suite result-file-to-csv $TEST_RESULTS_NAME
+
+# TEST 2.11.1 CPU: server (throughput limit) <-> client + phoronix
+export TEST_RESULTS_NAME=cpu-phoronix-result-openssl-vm-1-client-throttled-05
+export OUTPUT_DIR=/results
+
+wait_for_server "2_11_1_cpu"
+./client -i 1 -p 1 -s 0 &
+
+# start the benchmark
+# 2 defines the encryption
+printf '2\n' | phoronix-test-suite batch-run pts/openssl
+
+# after the benchmark send stop request
+./client -w
+
+# kill the clients
+pkill client
+
+# save the result to shared directory
+phoronix-test-suite result-file-to-csv $TEST_RESULTS_NAME
+# TEST 2.11.2 CPU: server (throughput limit) <-> client + phoronix
+export TEST_RESULTS_NAME=cpu-phoronix-result-openssl-vm-1-client-throttled-1
+export OUTPUT_DIR=/results
+
+wait_for_server "2_11_2_cpu"
+./client -i 1 -p 1 -s 0 &
+
+# start the benchmark
+# 2 defines the encryption
+printf '2\n' | phoronix-test-suite batch-run pts/openssl
+
+# after the benchmark send stop request
+./client -w
+
+# kill the clients
+pkill client
+
+# save the result to shared directory
+phoronix-test-suite result-file-to-csv $TEST_RESULTS_NAME
+# TEST 2.11.3 CPU: server (throughput limit) <-> client + phoronix
+export TEST_RESULTS_NAME=cpu-phoronix-result-openssl-vm-1-client-throttled-15
+export OUTPUT_DIR=/results
+
+wait_for_server "2_11_3_cpu"
+./client -i 1 -p 1 -s 0 &
+
+# start the benchmark
+# 2 defines the encryption
+printf '2\n' | phoronix-test-suite batch-run pts/openssl
+
+# after the benchmark send stop request
+./client -w
+
+# kill the clients
+pkill client
+
+# save the result to shared directory
+phoronix-test-suite result-file-to-csv $TEST_RESULTS_NAME
+# TEST 2.11.4 CPU: server (throughput limit) <-> client + phoronix
+export TEST_RESULTS_NAME=cpu-phoronix-result-openssl-vm-1-client-throttled-2
+export OUTPUT_DIR=/results
+
+wait_for_server "2_11_4_cpu"
+./client -i 1 -p 1 -s 0 &
+
+# start the benchmark
+# 2 defines the encryption
+printf '2\n' | phoronix-test-suite batch-run pts/openssl
+
+# after the benchmark send stop request
+./client -w
+
+# kill the clients
+pkill client
+
+# save the result to shared directory
+phoronix-test-suite result-file-to-csv $TEST_RESULTS_NAME
+
+exit
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/server.sh b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/server.sh
new file mode 100644
index 000000000..c5a4ae2e0
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/server.sh
@@ -0,0 +1,26 @@
+set -x
+
+cd ~
+
+mkdir -p llm-os
+rsync --progress --whole-file -a /llm-os ./ --exclude /llm-os/benchmarks/models --exclude /llm-os/loras --exclude /llm-os/benchmarks/vm_server --exclude /llm-os/benchmarks/vm_client --exclude /llm-os/.git
+cd llm-os
+
+# copy model
+mkdir -p models
+if [ ! -f "./models/model_file" ] ; then
+    cp /models/model_file "./models/model_file"
+fi
+
+# cleanup
+rm /server_signals/*
+
+export NIXPKGS_ALLOW_UNFREE=1
+echo "Entering nix env"
+nix-shell /vm_scripts/vm-server-shell.nix
+
+# cleanup
+rm /server_signals/*
+
+#shutdown now
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/server_nix.sh b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/server_nix.sh
new file mode 100644
index 000000000..6f57a2783
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/server_nix.sh
@@ -0,0 +1,168 @@
+set -x
+
+source /llm-os/benchmarks/config.sh
+export N_THREADS=16
+export OUTPUT_DIR=/results
+
+wait_for_server () {
+  until [ -f "/server_signals/test$1" ]
+  do
+     sleep 3
+  done
+}
+
+# compile llama.cpp
+export GGML_CUDA=1
+cd llama.cpp
+make clean
+make libllama.so -j8
+make libcommon.a -j8
+make libggml.so -j8
+
+# compile server
+cd ../intervm_comm
+make clean
+make server
+
+# TEST 2.1: server <-> client (5 minutes)
+./server -m ../models/model_file -p /results/gpu_1_client_server -f /server_signals/test2_1_gpu --n_threads=$N_THREADS
+
+# TEST 2.2: server <-> 2 * client
+./server -m ../models/model_file -p /results/gpu_2_client_server -f /server_signals/test2_2_gpu --n_threads=$N_THREADS
+
+# TEST 2.3: server <-> 4 * client
+./server -m ../models/model_file -p /results/gpu_4_client_server -f /server_signals/test3 --n_threads=$N_THREADS
+
+# TEST 2.4: server <-> 8 * client
+./server -m ../models/model_file -p /results/gpu_8_client_server -f /server_signals/test4 --n_threads=$N_THREADS
+
+# TEST 2.5: server <-> 16 * client
+./server -m ../models/model_file -p /results/gpu_16_client_server -f /server_signals/test5 --n_threads=$N_THREADS
+# TEST 2.5.32: server <-> 32 * client
+./server -m ../models/model_file -p /results/gpu_32_client_server -f /server_signals/test5_32 --n_threads=$N_THREADS
+# TEST 2.5.64: server <-> 64 * client
+./server -m ../models/model_file -p /results/gpu_64_client_server -f /server_signals/test5_64 --n_threads=$N_THREADS
+# TEST 2.5.1: server <-> 16 * client n_token 50
+./server -m ../models/model_file -p /results/gpu_16_client_server_50_tokens -f /server_signals/test5_1 --n_threads=$N_THREADS --tokens-to-gen=50
+# TEST 2.5.2: server <-> 16 * client n_token 100
+./server -m ../models/model_file -p /results/gpu_16_client_server_100_tokens -f /server_signals/test5_2 --n_threads=$N_THREADS --tokens-to-gen=100
+# TEST 2.5.3: server <-> 16 * client n_token 20
+./server -m ../models/model_file -p /results/gpu_16_client_server_20_tokens -f /server_signals/test5_3 --n_threads=$N_THREADS --tokens-to-gen=20
+# TEST 2.5.4: server <-> 16 * client n_token 30
+./server -m ../models/model_file -p /results/gpu_16_client_server_30_tokens -f /server_signals/test5_4 --n_threads=$N_THREADS --tokens-to-gen=30
+# TEST 2.5.5: server <-> 16 * client n_token 40
+./server -m ../models/model_file -p /results/gpu_16_client_server_40_tokens -f /server_signals/test5_5 --n_threads=$N_THREADS --tokens-to-gen=40
+
+# TEST 2.6: server <-> 2 * client (diff prio)
+./server -m ../models/model_file -p /results/gpu_2_client_diff_prio_server -f /server_signals/test6 --n_threads=$N_THREADS
+
+# TEST 2.7: server <-> 4 * client (diff prio)
+./server -m ../models/model_file -p /results/gpu_4_client_diff_prio_server -f /server_signals/test7 --n_threads=$N_THREADS
+
+# TEST 2.8: server <-> 8 * client (diff prio)
+./server -m ../models/model_file -p /results/gpu_8_client_diff_prio_server -f /server_signals/test8 --n_threads=$N_THREADS
+
+# TEST 2.9: server <-> 16 * client (diff prio)
+./server -m ../models/model_file -p /results/gpu_16_client_diff_prio_server -f /server_signals/test9 --n_threads=$N_THREADS
+# TEST 2.10: server <-> client + phoronix
+./server -m ../models/model_file -p /results/gpu_openssl_1_client_server -f /server_signals/test2_10_gpu --n_threads=$N_THREADS
+
+# TEST 2.11.1: server (throughput limit) <-> client + phoronix
+./server -m ../models/model_file -p /results/gpu_openssl_1_client_server_throtthle_10 -f /server_signals/test2_11_1_gpu --throughput-limit=10 --n_threads=$N_THREADS
+# TEST 2.11.2: server (throughput limit) <-> client + phoronix
+./server -m ../models/model_file -p /results/gpu_openssl_1_client_server_throtthle_20 -f /server_signals/test2_11_2_gpu --throughput-limit=20 --n_threads=$N_THREADS
+# TEST 2.11.3: server (throughput limit) <-> client + phoronix
+./server -m ../models/model_file -p /results/gpu_openssl_1_client_server_throtthle_30 -f /server_signals/test2_11_3_gpu --throughput-limit=30 --n_threads=$N_THREADS
+# TEST 2.11.4: server (throughput limit) <-> client + phoronix
+./server -m ../models/model_file -p /results/gpu_openssl_1_client_server_throtthle_40 -f /server_signals/test2_11_4_gpu --throughput-limit=40 --n_threads=$N_THREADS
+
+# TEST 2.12: client side
+wait_for_server "2_12_gpu_complete"
+
+# TEST 2.13 GPU: llama-bench gpu
+cd ../llama.cpp
+make llama-bench
+./llama-bench -m ../models/model_file -n 64,128,256,512 -ngl 999 --threads $N_THREADS -o json --progress > $OUTPUT_DIR/llama_bench_Q4_K_M_gpu_vm.json
+
+# compile llama.cpp
+unset GGML_CUDA
+cd ../llama.cpp
+make clean
+make libllama.so -j8
+make libcommon.a -j8
+make libggml.so -j8
+
+# compile server
+cd ../intervm_comm
+make clean
+make server
+
+# TEST 2.1: server <-> client (5 minutes)
+./server -m ../models/model_file -p /results/cpu_1_client_server_zero_sleep -f /server_signals/test2_1_cpu --n_threads=$N_THREADS
+
+# TEST 2.2: server <-> 2 * client
+./server -m ../models/model_file -p /results/cpu_2_client_server_zero_sleep -f /server_signals/test2_2_cpu --n_threads=$N_THREADS
+
+# TEST 2.3: server <-> 4 * client
+./server -m ../models/model_file -p /results/4_client_server_zero_sleep -f /server_signals/test3 --n_threads=$N_THREADS
+
+# TEST 2.4: server <-> 8 * client
+./server -m ../models/model_file -p /results/8_client_server_zero_sleep -f /server_signals/test4 --n_threads=$N_THREADS
+
+# TEST 2.5: server <-> 16 * client
+./server -m ../models/model_file -p /results/16_client_server_zero_sleep -f /server_signals/test5 --n_threads=$N_THREADS
+
+# TEST 2.5: server <-> 16 * client, 20 minutes
+./server -m ../models/model_file -p /results/16_client_server_20_minutes -f /server_signals/test5_20m --n_threads=$N_THREADS
+# TEST 2.5: server <-> 16 * client, 30 minutes
+./server -m ../models/model_file -p /results/16_client_server_30_minutes -f /server_signals/test5_30m --n_threads=$N_THREADS
+# TEST 2.5: server <-> 16 * client, 40 minutes
+./server -m ../models/model_file -p /results/16_client_server_40_minutes -f /server_signals/test5_40m --n_threads=$N_THREADS
+# TEST 2.5: server <-> 16 * client, 50 minutes
+./server -m ../models/model_file -p /results/16_client_server_50_minutes -f /server_signals/test5_50m --n_threads=$N_THREADS
+
+# TEST 2.5.1: server <-> 16 * client n_token 50
+./server -m ../models/model_file -p /results/16_client_server_50_tokens -f /server_signals/test5_1 --n_threads=$N_THREADS --tokens-to-gen=50
+# TEST 2.5.2: server <-> 16 * client n_token 100
+./server -m ../models/model_file -p /results/16_client_server_100_tokens -f /server_signals/test5_2 --n_threads=$N_THREADS --tokens-to-gen=100
+# TEST 2.5.3: server <-> 16 * client n_token 20
+./server -m ../models/model_file -p /results/16_client_server_20_tokens -f /server_signals/test5_3 --n_threads=$N_THREADS --tokens-to-gen=20
+# TEST 2.5.4: server <-> 16 * client n_token 30
+./server -m ../models/model_file -p /results/16_client_server_30_tokens -f /server_signals/test5_4 --n_threads=$N_THREADS --tokens-to-gen=30
+# TEST 2.5.5: server <-> 16 * client n_token 40
+./server -m ../models/model_file -p /results/16_client_server_40_tokens -f /server_signals/test5_5 --n_threads=$N_THREADS --tokens-to-gen=40
+
+# TEST 2.6: server <-> 2 * client (diff prio)
+./server -m ../models/model_file -p /results/2_client_diff_prio_server -f /server_signals/test6 --n_threads=$N_THREADS
+
+# TEST 2.7: server <-> 4 * client (diff prio)
+./server -m ../models/model_file -p /results/4_client_diff_prio_server -f /server_signals/test7 --n_threads=$N_THREADS
+
+# TEST 2.8: server <-> 8 * client (diff prio)
+./server -m ../models/model_file -p /results/8_client_diff_prio_server -f /server_signals/test8 --n_threads=$N_THREADS
+
+# TEST 2.9: server <-> 16 * client (diff prio)
+./server -m ../models/model_file -p /results/16_client_diff_prio_server -f /server_signals/test9 --n_threads=$N_THREADS
+
+# TEST 2.10: server <-> client + phoronix
+./server -m ../models/model_file -p /results/cpu_openssl_1_client_server -f /server_signals/test2_10_cpu --n_threads=$N_THREADS
+
+# TEST 2.11.1: server (throughput limit) <-> client + phoronix
+./server -m ../models/model_file -p /results/cpu_openssl_1_client_server_throtthle -f /server_signals/test2_11_1_cpu --throughput-limit=1 --n_threads=$N_THREADS
+# TEST 2.11.2: server (throughput limit) <-> client + phoronix
+./server -m ../models/model_file -p /results/cpu_openssl_1_client_server_throtthle -f /server_signals/test2_11_2_cpu --throughput-limit=2 --n_threads=$N_THREADS
+# TEST 2.11.3: server (throughput limit) <-> client + phoronix
+./server -m ../models/model_file -p /results/cpu_openssl_1_client_server_throtthle -f /server_signals/test2_11_3_cpu --throughput-limit=3 --n_threads=$N_THREADS
+# TEST 2.11.4: server (throughput limit) <-> client + phoronix
+./server -m ../models/model_file -p /results/cpu_openssl_1_client_server_throtthle -f /server_signals/test2_11_4_cpu --throughput-limit=4 --n_threads=$N_THREADS
+
+# TEST 2.12: none <-> phoronix
+# this has already been executed
+
+# TEST 2.13 CPU: llama-bench
+cd ../llama.cpp
+make llama-bench
+./llama-bench -m ../models/model_file -p 0 -n 64,128,256,512 -ngl 0 --threads $N_THREADS -o json --progress > $OUTPUT_DIR/llama_bench_Q4_K_M_cpu_vm.json
+
+exit
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/vm-client-shell.nix b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/vm-client-shell.nix
new file mode 100644
index 000000000..be4456318
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/vm-client-shell.nix
@@ -0,0 +1,32 @@
+#{ pkgs ? import /ssd_extension/teofil/nixpkgs {} }:
+#{ pkgs ? import <nixpkgs> {cudaSupport = true;} }:
+#{pkgs ? import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/8b27c1239e5c421a2bbc2c65d52e4a6fbf2ff296.tar.gz") {} }:   
+{pkgs ? import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/63dacb46bf939521bdc93981b4cbb7ecb58427a0.tar.gz") {} }:   
+
+(pkgs.buildFHSUserEnv {
+ name = "cudazone";
+ targetPkgs = pkgs: (with pkgs; [
+		 gcc12
+		 gdb
+		 ccache
+#		 cudaPackages.nsight_systems
+#		 cudaPackages.cuda_nvcc
+		 cudatoolkit
+		 cudaPackages.cuda_cudart.stubs
+		 git-lfs
+		 glibc.dev
+		 gnumake
+		 cmake
+		 man
+		 go
+		 unzip
+		 curl.dev
+		 phoronix-test-suite
+		 python3
+		 php
+		 nixos-shell
+		 fzf
+ ]);
+ runScript = "bash /vm_scripts/client_nix.sh";
+ }).env
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/vm-server-shell.nix b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/vm-server-shell.nix
new file mode 100644
index 000000000..fbed458d3
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_scripts/vm-server-shell.nix
@@ -0,0 +1,32 @@
+#{ pkgs ? import /ssd_extension/teofil/nixpkgs {} }:
+#{ pkgs ? import <nixpkgs> {cudaSupport = true;} }:
+#{pkgs ? import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/8b27c1239e5c421a2bbc2c65d52e4a6fbf2ff296.tar.gz") {} }:   
+{pkgs ? import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/63dacb46bf939521bdc93981b4cbb7ecb58427a0.tar.gz") {} }:   
+
+(pkgs.buildFHSUserEnv {
+ name = "cudazone";
+ targetPkgs = pkgs: (with pkgs; [
+		 gcc12
+		 gdb
+		 ccache
+#		 cudaPackages.nsight_systems
+#		 cudaPackages.cuda_nvcc
+		 cudatoolkit
+		 cudaPackages.cuda_cudart.stubs
+		 git-lfs
+		 glibc.dev
+		 gnumake
+		 cmake
+		 man
+		 go
+		 unzip
+		 curl.dev
+		 phoronix-test-suite
+		 python3
+		 php
+		 nixos-shell
+		 fzf
+ ]);
+runScript = "bash /vm_scripts/server_nix.sh";
+ }).env
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_server/server.nix b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_server/server.nix
new file mode 100644
index 000000000..2a07b8e35
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/benchmarks/vm_server/server.nix
@@ -0,0 +1,82 @@
+{ pkgs, config, ... }:
+{
+  nixpkgs.config.allowUnfree = true;
+  nixpkgs.config.nvidia.acceptLicense = true;
+  # enable the nvidia driver
+  services.xserver.videoDrivers = [ "nvidia" ];
+  hardware.opengl.enable = true;
+  hardware.nvidia.datacenter.enable = true;
+  hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.dc_535;
+
+  hardware.nvidia.open = true;
+
+  virtualisation.docker.enable = true;
+  hardware.nvidia-container-toolkit.enable = true;
+  hardware.opengl.driSupport32Bit = true;
+
+  virtualisation.memorySize = 64 * 1024;
+  virtualisation.diskSize = 128 * 1024;
+  # Nix Store must be persistent across all QEMU
+  # executions
+  virtualisation.writableStoreUseTmpfs = false;
+
+  services.openssh.enable = true;
+  virtualisation.forwardPorts = [
+      { from = "host"; host.port = 2345; guest.port = 22; }
+    ];
+
+  virtualisation.qemu.options = [
+    #"-name NIXVM,debug-threads=on"
+    "-enable-kvm"
+    "-cpu host"
+#    "-device vfio-pci,host=ca:00.0"
+    "-device ivshmem-plain,memdev=shmBerkay,bus=pci.0,addr=0x12,master=on"
+    "-object memory-backend-file,size=32M,share=on,mem-path=/dev/shm/shmBerkay,id=shmBerkay"
+    "-smp 32,sockets=1,cores=32,threads=1,maxcpus=32"
+    "-m 64G"
+  ];
+
+  nixos-shell.mounts.extraMounts = {
+    # override options for each mount
+    "/llm-os" = {
+      target = ../../.;
+      cache = "none";
+    };
+    "/results" = {
+      target = ../results;
+      cache = "none";
+    };
+    "/vm_scripts" = {
+      target = ../vm_scripts;
+      cache = "none";
+    };
+    "/models" = {
+      target = ../models;
+      cache = "none";
+    };
+    "/server_signals" = {
+      target = ../server_signals;
+      cache = "none";
+    };
+  };
+  nixos-shell.mounts = {
+    mountHome = false;
+    mountNixProfile = false;
+    cache = "none"; # default is "loose"
+  };
+
+
+  environment.systemPackages = [
+  	config.boot.kernelPackages.nvidiaPackages.dc_535
+#	config.boot.kernelPackages.nvidia_x11
+	pkgs.git
+	pkgs.phoronix-test-suite
+	pkgs.neovim
+	pkgs.cudaPackages.cuda_cudart.stubs
+	pkgs.cudatoolkit
+	pkgs.pciutils
+	pkgs.openssh
+	];
+}
+
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/cuda-shell.nix b/archive/2025/summer/msc_berkay_eren_ueruen/cuda-shell.nix
new file mode 100644
index 000000000..e2ec91c8f
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/cuda-shell.nix
@@ -0,0 +1,32 @@
+#{ pkgs ? import /ssd_extension/teofil/nixpkgs {} }:
+#{ pkgs ? import <nixpkgs> {cudaSupport = true;} }:
+#{pkgs ? import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/8b27c1239e5c421a2bbc2c65d52e4a6fbf2ff296.tar.gz") {} }:   
+{pkgs ? import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/63dacb46bf939521bdc93981b4cbb7ecb58427a0.tar.gz") {} }:   
+
+(pkgs.buildFHSUserEnv {
+ name = "cudazone";
+ targetPkgs = pkgs: (with pkgs; [
+		 gcc12
+		 gdb
+		 ccache
+#		 cudaPackages.nsight_systems
+#		 cudaPackages.cuda_nvcc
+		 cudatoolkit
+		 cudaPackages.cuda_cudart.stubs
+		 git-lfs
+		 glibc.dev
+		 gnumake
+		 cmake
+		 man
+		 go
+		 unzip
+		 curl.dev
+		 phoronix-test-suite
+		 python3
+		 php
+		 nixos-shell
+		 fzf
+ ]);
+ runScript = "bash";
+ }).env
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/flake.nix b/archive/2025/summer/msc_berkay_eren_ueruen/flake.nix
new file mode 100644
index 000000000..71d1e199d
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/flake.nix
@@ -0,0 +1,78 @@
+{
+  description = "Development environment for this project";
+
+  inputs = {
+    nixpkgs.url = "git+https://github.com/TUM-DSE/nixpkgs.git?ref=nixos-24.11-backports&shallow=1";
+    #nixpkgs.url = "github:TUM-DSE/nixpkgs/63dacb46bf939521bdc93981b4cbb7ecb58427a0";
+    jetpack-nixos.url = "git+https://github.com/TUM-DSE/jetpack-nixos.git?shallow=1";
+    #jetpack-nixos.url = "git+https://github.com/TUM-DSE/jetpack-nixos.git?shallow=1&ref=final-stretch";
+    jetpack-nixos.inputs.nixpkgs.follows = "nixpkgs";
+    flake-parts.url = "github:hercules-ci/flake-parts";
+  };
+
+  outputs =
+    inputs@{ flake-parts, nixpkgs, ... }:
+    flake-parts.lib.mkFlake { inherit inputs; } (
+      { lib, ... }:
+      {
+        systems = lib.systems.flakeExposed;
+
+        # Usage:
+		# $ nixos-shell --flake .#nvidia-vms
+		flake.nixosConfigurations.nvidia-vm = nixpkgs.lib.nixosSystem {
+		  modules = [{
+		   imports = [
+		     ./nvidia-gpu-patrick-latests.nix
+		   ];
+		  }];
+		  specialArgs = {
+		    inputs =  inputs;
+		  };
+		};
+        perSystem =
+          {
+            inputs',
+            pkgs,
+            system,
+            ...
+          }:
+          {
+            _module.args.pkgs = import inputs.nixpkgs {
+              inherit system;
+              config = {
+                allowUnfree = true;
+                cudaSupport = true;
+                # Only for jetson devices: https://en.wikipedia.org/wiki/CUDA#GPUs_supported
+                # Faster compilation time?
+                cudaCapabilities = [ "8.7" ];
+
+              };
+#              overlays = [ (final: prev: { cudaPackages = inputs'.jetpack-nixos.legacyPackages.cudaPackages; }) ];
+            overlays = [  ];
+            };
+            packages.default = pkgs.mkShell {
+			   buildInputs = [
+			   pkgs.expat.dev
+          ];
+              packages = [
+                pkgs.bashInteractive
+#                pkgs.python3Packages.torch
+                #pkgs.cudaPackages.cuda_nvcc
+				#pkgs.cudaPackages.cuda_gdb
+				pkgs.cudaPackages.cudatoolkit
+				pkgs.cudaPackages.cuda_cudart
+				pkgs.cudaPackages.cuda_cudart.stubs
+				pkgs.expat
+				pkgs.nixos-shell
+              ];
+              shellHook = ''
+                export CUDA_PATH=${pkgs.cudaPackages.cudatoolkit}
+              '';
+              LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath [
+                "/run/opengl-driver"
+              ];
+            };
+          };
+      }
+    );
+}
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/Makefile b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/Makefile
new file mode 100644
index 000000000..e432c7991
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/Makefile
@@ -0,0 +1,50 @@
+# Directories
+LLAMA_CPP_DIR := ../llama.cpp
+LLAMA_CPP_INC := ../llama.cpp/include
+LLAMA_CPP_COMMON_INC := $(LLAMA_CPP_DIR)/common
+LLAMA_CPP_BENCH := $(LLAMA_CPP_DIR)/examples/llama_bench/llama_bench.cpp
+GGML_INC := $(LLAMA_CPP_DIR)/ggml/include
+LIB_DIR := $(PWD)/libs
+SRC_DIR = src
+SERVER_DIR = $(SRC_DIR)/server
+CLIENT_DIR = $(SRC_DIR)/client
+
+# Compiler
+CC = g++
+CFLAGS = -Wall -Wno-deprecated-declarations -g -Isrc/include -std=c++20 -O3
+LLAMA_FLAGS = -I$(LLAMA_CPP_INC) -I$(LLAMA_CPP_COMMON_INC) -I$(GGML_INC) 
+# Set the library path for the runtime shared libraries
+LDFLAGS = -L$(LIB_DIR) -Wl,-rpath,$(LIB_DIR) -lllama -lggml -lcommon
+
+
+# Source files
+SERVER_SRC = $(SERVER_DIR)/llama_server.cpp src/shm.cpp
+CLIENT_SRC = $(CLIENT_DIR)/write.c src/shm.cpp
+
+# Output executables
+SERVER_OUT = server
+CLIENT_OUT = client
+
+# Targets
+all: llama.cpp $(SERVER_OUT) $(CLIENT_OUT)
+
+# Get llama.cpp libraries
+llama.cpp:
+	mkdir -p libs; \
+	find $(LLAMA_CPP_DIR) -name "lib*" -exec cp {} $(LIB_DIR) \;
+
+# Build server
+$(SERVER_OUT): $(SERVER_SRC) llama.cpp
+	$(CC) $(CFLAGS) $(LLAMA_FLAGS) -o $(SERVER_OUT) $(SERVER_SRC) libs/libcommon.a $(LDFLAGS) 
+
+# Build client
+$(CLIENT_OUT): $(CLIENT_SRC)
+	$(CC) $(CFLAGS) -o $(CLIENT_OUT) $(CLIENT_SRC)
+
+# Clean
+clean:
+	rm -f $(SERVER_OUT) $(CLIENT_OUT)
+
+# Phony targets
+.PHONY: all clean
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/README.md b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/README.md
new file mode 100644
index 000000000..8e9601ab6
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/README.md
@@ -0,0 +1,92 @@
+# Interprocess Communication
+This setup consists of two applications: the server, which is responsible for processing inference requests by making the required llama.cpp calls, and the client itself.
+
+## Shared Memory
+Inter-VM communication requires some preparation. First of all, the VMs must be started with some extra parameters. An example of this can be found in the `benchmarks/vm_server/server.nix` and `benchmarks/vm_client/client.nix` files.
+
+`-device ivshmem-plain,memdev=shmTest,bus=pci.0,addr=0x12,master=on`
+
+This first parameter adds a shared memory device. The `addr` field here can be anything; however, since it is used by the source code in the `shm.h` file, it must stay consistent with that.
+
+`-object memory-backend-file,size=32M,share=on,mem-path=/dev/shm/shmTest,id=shmTest`
+
+This second parameter specifies the backend required by the shared memory device. As long as the same name, in this case `shmTest`, is used consistently, the only interesting parameter is the size, which can be set according to your needs; the 32M used here was chosen without any particular meaning.
+
+When a QEMU VM is started with these parameters, it will have access to the defined shared memory. When two QEMU VMs are started with the exact same shared memory parameters, both will have access to the same shared memory.
+
+The resulting memory is represented as a file. On the hypervisor, it can be found in `/dev/shm`. In the VMs, its location depends on the `addr` parameter that we specified, but it will be under the `/sys/bus/pci/devices/` directory. With our example parameters, the exact location looks like this: `/sys/bus/pci/devices/0000:00:12.0/resource2`.
+
+In the source code, this shared memory file must be opened and then `mmap`ed; a minimal sketch of this is shown below, and a full example can be found in `/intervm_comm/src/shm.cpp`.
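+
+The sketch below only illustrates the idea: error handling is minimal, the request-queue struct from `shm.h` is omitted, and the PCI path and 32M size simply follow the example parameters above.
+```
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+int main(void) {
+    // Inside a VM the ivshmem device is exposed as a PCI resource file;
+    // natively (or on the hypervisor) "/dev/shm/shmTest" can be opened instead.
+    int fd = open("/sys/bus/pci/devices/0000:00:12.0/resource2", O_RDWR);
+    if (fd == -1) { perror("open"); return 1; }
+
+    // 32M matches the size given to the memory-backend-file object above.
+    void* shm = mmap(NULL, 32 * 1024 * 1024, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    if (shm == MAP_FAILED) { perror("mmap"); return 1; }
+
+    // The real code casts this pointer to the shared request queue defined in shm.h.
+    munmap(shm, 32 * 1024 * 1024);
+    close(fd);
+    return 0;
+}
+```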
+
+### Native Testing
+In case the inter-VM setup is run natively without any VMs, it is possible to create a shared memory file by simply running `touch /dev/shm/shmTest`; `shmTest` here can be any name imaginable. Similar to the VM setup, the `shm.h` file must be updated so that it uses this local shared memory file.
+
+### Troubleshooting
+In some cases, especially when the hypervisor mistakenly writes to the shared memory, QEMU complains about the shared memory while starting a VM. The best way to overcome such errors is to simply delete the shared memory file in the `/dev/shm` folder. This is safe, as long as the data in the file is expendable, since the file will be created again automatically when a QEMU VM is launched.
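+
+For example, with the backing file name used above:
+```
+rm /dev/shm/shmTest
+```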
+
+## Compilation
+The inter-VM communication code makes use of shared libraries generated by the llama.cpp compilation process. Therefore, we first need to compile the libraries.
+
+### llama.cpp Library Compilation
+Using `cmake` directly generates the libraries inside `llama.cpp/build/bin`.
+```
+# In the root of llama.cpp
+cmake -B build -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE=Debug
+cmake --build build
+```
+
+`make`, on the other hand, must be called with the following command; note that with the latest version it is deprecated:
+```
+# In the root of llama.cpp
+make lib*
+```
+This will generate the libraries in the root directory of `llama.cpp`.
+
+If this fails to generate the lib files, each of them should be generated manually:
+```
+# In the root of llama.cpp
+make libggml.so
+make libggml.a
+make libcommon.so
+make libcommon.a
+make libllama.so
+make libllama.a
+```
+
+### Intervm Communication Compilation
+
+```
+nix-shell ../cuda-shell.nix
+make
+```
+As a result, a `server` and a `client` binary will be created.
+
+## Usage
+A good way to test the setup is to launch two VMs, one dedicated to the server and the other to the client. The instructions for that can be found in the root README.
+
+On the client VM side, `tmux` or `screen` can be used to create multiple panes to follow several clients at the same time.
+
+### Server
+The only mandatory option the server takes is the path to the model file. It is also possible to specify one or more LoRA files. An example invocation is shown after the usage text below.
+```
+Usage: server [OPTION...]
+LLM-OS Server
+
+  -l, --lora=FILE            Path to a LoRA file (Can be specified multiple
+                             times)
+  -m, --model=FILE           Path to the model file (Required)
+  -?, --help                 Give this help list
+      --usage                Give a short usage message
+```
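+
+For example (the model path is illustrative):
+```
+./server -m ../models/model_file
+```
+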
+### Client
+The client requires an `id` to be specified. This `id` currently also selects the prompt to be sent. `lora_number` and `prio` are optional extra arguments that default to 0 when not specified. An example invocation follows the usage text below.
+```
+Usage: client [OPTION...]
+Client-side argument parser
+
+  -i, --id=ID                User ID (Required)
+  -n, --lora_number=NUM      Number of LoRA (Default = 0)
+  -p, --prio=PRIO            Priority level (Default = 0)
+  -?, --help                 Give this help list
+      --usage                Give a short usage message
+```
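+
+For example, to start a client with ID 1 and priority 1 while the server is running:
+```
+./client -i 1 -p 1
+```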
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/client/write.c b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/client/write.c
new file mode 100644
index 000000000..f6c75bf53
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/client/write.c
@@ -0,0 +1,210 @@
+#include <assert.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "args_client.h"
+#include "shm.h"
+
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+
+int id;
+
+struct Communication comm;
+
+void shutdown_server() {
+    while (sem_trywait(&comm.requestQueue->run) == -1) {
+    };
+}
+
+void write_to_shm(struct SharedMemory* shm, const char* text, int n_chars_to_gen, int index,
+                  int prio, enum Lora lora, float throughput) {
+    // TODO: this zero set is a safety net
+    // currently it protects the memory if a client dies without cleaning the memory
+    // A better way would be to register a cleanup signal handler.
+    for (int i = 0; i < MAX_TEXT; i++) {
+        shm->requests[index].text[i] = 0;
+    }
+
+    printf("Writing to shm index: [%d]\n"
+           "-> Current string in cell [%d] (should be empty): %s\n"
+           "-> Writing to cell...\n",
+           index, index, shm->requests[index].text);
+
+    for (int i = 0; i < (int)strlen(text); i++) {
+        shm->requests[index].text[i] = text[i];
+    }
+
+    printf("-> Resulting string in cell: %s\n", shm->requests[index].text);
+
+    shm->requests[index].n_chars_to_gen = n_chars_to_gen;
+    shm->requests[index].id = id;
+    shm->requests[index].prio = prio;
+    shm->requests[index].lora = lora;
+    shm->requests[index].throughput = throughput;
+}
+
+void print_from_shm(struct SharedMemory* shm, int str_len, int n_gen, int free_request_slot_num) {
+    printf("\n---------- Answer ----------\n");
+    bool server_done = false;
+    for (int i = str_len; server_done == false && i < str_len + n_gen; i++) {
+        while (shm->requests[free_request_slot_num].text[i] == '\0') {
+            // Server might have generated an end of gen char before we reached the number of chars
+            // we want to generate and expect. So we check if the server is done
+            if (sem_trywait(&comm.requestQueue->requests[free_request_slot_num].clientNotifier) ==
+                0) {
+                server_done = true;
+                break;
+            }
+        }
+        printf("%c", shm->requests[free_request_slot_num].text[i]);
+        fflush(stdout);
+    }
+    printf("\n------- End of Answer -------\n");
+
+    // If the server is not done, we wait for it.
+    if (server_done == false) {
+        // sem_wait seems to cause an unexpected SIGABRT ("The futex facility returned an unexpected
+        // error code"). This is also true on the server side, hence the busy waiting.
+        while (sem_trywait(&comm.requestQueue->requests[free_request_slot_num].clientNotifier) ==
+               -1) {
+        };
+    }
+
+    printf("\nRequest has been fully answered!\n");
+    fflush(stdout);
+}
+
+int find_free_shm_request() {
+    for (int i = 0; i < MAX_REQUESTS; i++) {
+        if (pthread_mutex_trylock(&comm.requestQueue->requests[i].mutex) == 0) {
+            printf("=============================\n"
+                   "Free slot found: %d\n",
+                   i);
+            fflush(stdout);
+            return i;
+        }
+    }
+    return -1;
+}
+
+void free_request(int num) {
+    printf("Freeing cell: %d\n", num);
+    printf("-> Current string in cell [%d]: %s\n", num, comm.requestQueue->requests[num].text);
+    for (int i = 0; i < MAX_TEXT; i++) {
+        comm.requestQueue->requests[num].text[i] = 0;
+    }
+    printf("-> After removal cell [%d] content (should be empty): %s\n", num,
+           comm.requestQueue->requests[num].text);
+
+    pthread_mutex_unlock(&comm.requestQueue->requests[num].mutex);
+}
+
+const char* prompts[] = {"Building a website",
+                         "The sky",
+                         "Albert Einstein was",
+                         "First rule of",
+                         "Reversible computing is",
+                         "In summer, trees are",
+                         "Sunken cost of",
+                         "In their first album, Pink Floyd",
+                         "Preparing for a presentation could be",
+                         "To prepare a good iskender"};
+
+const char* prompts2[] = {
+    "Best kebap",
+};
+
+volatile int free_request_slot_num;
+
+void* throughput_input(void*) {
+    while (1) {
+        float new_throughput;
+        scanf("%f", &new_throughput);
+
+        while (pthread_mutex_trylock(
+                   &comm.requestQueue->requests[free_request_slot_num].throughput_mutex) != 0)
+            ;
+
+        comm.requestQueue->requests[free_request_slot_num].throughput = new_throughput;
+        pthread_mutex_unlock(&comm.requestQueue->requests[free_request_slot_num].throughput_mutex);
+    }
+}
+
+int main(int argc, char** argv) {
+    Arguments arguments = {0, 0, 0};  // Default values
+
+    // Parse arguments
+    argp_parse(&argp, argc, argv, 0, 0, &arguments);
+
+    init_shm(false, comm, arguments.shm_location);
+
+    // Sanitize args
+    int prio = 0;
+    if (arguments.prio >= NB_PRIORITIES) {
+        prio = NB_PRIORITIES - 1;
+        printf("Specified prio is higer than the max, using the max (%d)\n", prio);
+    } else if (arguments.prio < 0) {
+        prio = 0;
+        printf("Specified prio is lower than 0, using 0\n");
+    } else {
+        prio = arguments.prio;
+    }
+
+    if (arguments.shutdown) {
+        printf("Sending shutdown request to server...");
+        shutdown_server();
+        return 0;
+    }
+
+    pthread_t t_user_input;  // Declare a thread
+    if (arguments.active_throughput) {
+        pthread_create(&t_user_input, NULL, throughput_input, NULL);  // Create the thread
+    }
+
+    id = arguments.id;
+    enum Lora lora = (enum Lora)arguments.lora_number;
+
+    int loop_forever = arguments.repeats == -1 ? 1 : 0;
+
+    while (loop_forever || arguments.repeats > 0) {
+        free_request_slot_num = find_free_shm_request();
+        if (free_request_slot_num == -1) {
+            continue;
+        }
+
+        arguments.repeats--;
+
+        const char* prompt = prompts[id % 10];
+        const int n_chars_to_gen = 2048;
+
+        printf("Sending the following request:\n"
+               "-> %s\n",
+               prompt);
+        fflush(stdout);
+
+        write_to_shm(comm.requestQueue, prompt, n_chars_to_gen, free_request_slot_num, prio, lora,
+                     arguments.throughput);
+
+        // Notify the server that there is a new request
+        sem_post(&comm.requestQueue->requests[free_request_slot_num].serverNotifier);
+        sem_post(&comm.requestQueue->active_reqs);
+
+        // This will block until the server notifies that the request is completed
+        print_from_shm(comm.requestQueue, strlen(prompt), n_chars_to_gen, free_request_slot_num);
+
+        free_request(free_request_slot_num);
+
+        if (arguments.sleep_time > 0) {
+            sleep(arguments.sleep_time);
+        }
+    }
+
+    if (arguments.active_throughput) {
+        // Exit only the main thread; the throughput input thread keeps running.
+        pthread_exit(NULL);
+    }
+
+    return 0;
+}
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/hypervisor/hypervisor.cpp b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/hypervisor/hypervisor.cpp
new file mode 100644
index 000000000..d03980edb
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/hypervisor/hypervisor.cpp
@@ -0,0 +1,134 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include "shm.h"
+
+#define QMP_SOCKET_SERVER_PATH "/tmp/qmp-socket-server"
+#define QMP_SOCKET_CLIENT_PATH "/tmp/qmp-socket-client"
+#define BUFFER_SIZE 1024
+
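+// QMP commands for hot-unplugging the GPU from one VM (device_del) and
+// hot-plugging it into the other as a VFIO passthrough device (device_add).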
+const char *qmp_command_del = "{ \"execute\": \"device_del\", \"arguments\": { \"id\": \"nvidia_gpu\" } }\n";
+const char *qmp_command_add = "{ \"execute\": \"device_add\", \"arguments\": { \"driver\": \"vfio-pci\", \"host\": \"ca:00.0\", \"id\": \"nvidia_gpu\" } }\n";
+
+static long round_up(long n, long mult)
+{
+    return ((n + mult - 1) / mult) * mult;
+}
+
+void send_qmp_command(int sock, const char *command) {
+    if (write(sock, command, strlen(command)) < 0) {
+        perror("write");
+        exit(EXIT_FAILURE);
+    }
+}
+
+void read_qmp_response(int sock) {
+    char buffer[BUFFER_SIZE];
+    ssize_t bytes_read;
+
+    bytes_read = read(sock, buffer, sizeof(buffer) - 1);
+    if (bytes_read > 0) {
+        buffer[bytes_read] = '\0';
+        printf("Response:\n%s\n", buffer);
+    } else {
+        perror("read");
+        exit(EXIT_FAILURE);
+    }
+}
+
+struct Communication comm;
+
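+// Map the host-side shared-memory file backing the request queue so this
+// helper can poll the switch-control semaphores written by the server.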
+int init_shm(){
+    int fd = open("/dev/shm/shm1", O_RDWR);
+    assert(fd != -1);
+
+    long pagesize = sysconf(_SC_PAGESIZE);
+    long shm_size = round_up(sizeof(struct SharedMemory), pagesize);
+    if (ftruncate(fd, shm_size) == -1) {
+        perror("ftruncate");
+        return -1;
+    }
+
+    comm.requestQueue = (SharedMemory*) mmap(0, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    assert(comm.requestQueue != NULL);
+    assert(comm.requestQueue != MAP_FAILED);
+
+    close(fd);
+
+    return 0;
+}
+
+int main(void){
+    init_shm();
+    int sock_server, sock_client;
+    struct sockaddr_un addr_server, addr_client;
+
+    // Create a Unix domain socket
+    if ((sock_server = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
+        perror("socket");
+        exit(EXIT_FAILURE);
+    }
+    if ((sock_client = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
+        perror("socket");
+        exit(EXIT_FAILURE);
+    }
+
+    // Set up the socket address structure
+    memset(&addr_server, 0, sizeof(addr_server));
+    addr_server.sun_family = AF_UNIX;
+    strncpy(addr_server.sun_path, QMP_SOCKET_SERVER_PATH, sizeof(addr_server.sun_path) - 1);
+    
+    memset(&addr_client, 0, sizeof(addr_client));
+    addr_client.sun_family = AF_UNIX;
+    strncpy(addr_client.sun_path, QMP_SOCKET_CLIENT_PATH, sizeof(addr_client.sun_path) - 1);
+
+    // Connect to QEMU's QMP socket
+    if (connect(sock_server, (struct sockaddr *)&addr_server, sizeof(addr_server)) == -1) {
+        perror("connect");
+        exit(EXIT_FAILURE);
+    }
+    if (connect(sock_client, (struct sockaddr *)&addr_client, sizeof(addr_client)) == -1) {
+        perror("connect");
+        exit(EXIT_FAILURE);
+    }
+
+    // Read QMP greeting message
+    read_qmp_response(sock_server);
+    read_qmp_response(sock_client);
+
+    // Enable QMP capabilities
+    const char *qmp_capabilities = "{ \"execute\": \"qmp_capabilities\" }\n";
+    send_qmp_command(sock_server, qmp_capabilities);
+    read_qmp_response(sock_server);
+    send_qmp_command(sock_client, qmp_capabilities);
+    read_qmp_response(sock_client);
+    
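+    // Poll for GPU switch requests from the server: detach the GPU from the
+    // LLM-OS VM, attach it to the client VM over QMP, then signal completion.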
+    while(true){
+        if(sem_trywait(&comm.requestQueue->control.hypervisor_switch_notification) != 0){
+            continue;
+        }
+        printf("Hypervisor received GPU request...\n");
+        switch(comm.requestQueue->control.target){
+            case HOST:
+                // device_del the GPU from the LLMOS
+                send_qmp_command(sock_server, qmp_command_del);
+                read_qmp_response(sock_server);
+                // device_add the GPU to client
+                send_qmp_command(sock_client, qmp_command_add);
+                read_qmp_response(sock_client);
+                break;
+            default:
+                break;
+        };
+        sem_post(&comm.requestQueue->control.switch_complete);
+    }
+    close(sock_server);
+    close(sock_client);
+}
\ No newline at end of file
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/include/args_client.h b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/include/args_client.h
new file mode 100644
index 000000000..bdf452207
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/include/args_client.h
@@ -0,0 +1,84 @@
+#include <argp.h>
+
+#include <iostream>
+#include <string>
+
+#define ARGUMENT_SHM_SHORT 0x1
+
+// Structure to hold parsed arguments
+struct Arguments {
+    int id;
+    int prio;
+    int lora_number = 0;  // defaults to no lora (0)
+    int repeats = -1;     // defaults to infinite
+    int sleep_time = 3;
+    int clean = 0;
+    int shutdown = 0;
+    float throughput = 999;
+    int active_throughput = 0;
+    std::string shm_location;
+};
+
+// Program documentation
+static char doc[] = "Client-side argument parser using argp.h";
+
+// Options
+static struct argp_option options[] = {
+    {"id", 'i', "ID", 0, "User ID (Required)"},
+    {"prio", 'p', "PRIO", 0, "Priority level (Default = 0)"},
+    {"lora_number", 'l', "NUM", 0, "Number of LoRA (Default = 0 / No LoRA)"},
+    {"repeat", 'r', "REP", 0, "Number of requests to send (Default = -1 / Infinite)"},
+    {"sleep", 's', "SLP", 0, "Seconds to sleep between requests (Default = 3). 0 means no sleep"},
+    {"throughput", 'd', "THORUGHPUT", 0, "Throughput limit"},
+    {"active-throughput", 'a', 0, 0, "Creates a thread, listening the user input for new throughput limits"},
+    {"shutdown", 'w', 0, 0, "Send shutdown request to the server"},
+    {"shared-mem", ARGUMENT_SHM_SHORT, "SHM_LOCATION", 0, "Specify shm locations"},
+    {0}};
+
+// Argument parser function
+static error_t parse_opt(int key, char *arg, struct argp_state *state) {
+    Arguments *arguments = static_cast<Arguments *>(state->input);
+
+    switch (key) {
+        case 'i':
+            arguments->id = std::stoi(arg);
+            break;
+        case 'p':
+            arguments->prio = std::stoi(arg);
+            break;
+        case 'l':
+            arguments->lora_number = std::stoi(arg);
+            break;
+        case 'r':
+            arguments->repeats = std::stoi(arg);
+            break;
+        case 's':
+            arguments->sleep_time = std::stoi(arg);
+            break;
+        case 'd':
+            arguments->throughput = std::stof(arg);
+            break;
+        case 'a':
+            arguments->active_throughput = 1;
+            break;
+        case 'w':
+            arguments->shutdown = 1;
+            arguments->prio = 0;  // highest prio
+            arguments->repeats = 1;
+            break;
+        case ARGUMENT_SHM_SHORT:
+            arguments->shm_location = arg;
+            break;
+        case ARGP_KEY_END:
+            if (arguments->id == 0 && arguments->shutdown == 0) {
+                argp_usage(state);  // Print usage and exit if required args are missing
+            }
+            break;
+        default:
+            return ARGP_ERR_UNKNOWN;
+    }
+    return 0;
+}
+
+// Argp parser setup
+static struct argp argp = {options, parse_opt, nullptr, doc};
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/include/args_server.h b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/include/args_server.h
new file mode 100644
index 000000000..f8af5f321
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/include/args_server.h
@@ -0,0 +1,106 @@
+#include <argp.h>
+
+#include <iostream>
+#include <string>
+#include <vector>
+#include <limits.h>
+#define ARGUMENT_SHM_SHORT 0x1
+#define ARGUMENT_THREAD 0x2
+
+struct Arguments {
+    std::string model_path;               // Required model path
+    std::vector<std::string> lora_paths;  // Optional multiple LoRA paths
+    bool state_save = false;
+    bool kv_save = false;
+    bool auto_policy = false;
+    bool llama_bench_args = false;
+    float throughput_limit;
+    int tokens_to_gen;
+    unsigned int token_limit = UINT_MAX;
+    unsigned int n_threads = 1;
+    std::string performance_metrics_file;
+    std::string sig_file;
+    std::string shm_location;
+};
+
+// Program documentation
+static char doc[] = "LLM-OS Server";
+
+// Options
+static struct argp_option options[] = {
+    {"model", 'm', "FILE", 0, "Path to the model file (Required)"},
+    {"lora", 'l', "FILE", 0, "Path to a LoRA file (Can be specified multiple times)"},
+    {"perf-metrics", 'p', "FILE", 0, "Path to a performance metrics file"},
+    {"state-save", 's', 0, 0, "Enable state save and load between context switches"},
+    {"kv-save", 'k', 0, 0, "Enable kv cache save and load between context switches"},
+    {"auto-policy", 'a', 0, 0, "Choose the kv cache policy automatically"},
+    {"llama-bench-args", 'b', 0, 0, "Use llama-bench defaults"},
+    {"throughput-limit", 't', "LIMIT", 0, "Upper thoughput limit for the server (tokens/s)"},
+    {"n_threads", ARGUMENT_THREAD, "N_THREADS", 0, "Number of threads for inference"},
+    {"tokens-to-gen", 'g', "NB_TOKENS", 0,
+     "Number of tokens to generate before engaging the round robin mechanism and performing a "
+     "context switch"},
+    {"token_limit", 'z', "NB_TOKENS", 0,
+     "Number of total tokens to generate before quiting"},
+    {"signal-file", 'f', "FILE", 0,
+     "Create a temporary file at the start and delete it at the end"},
+    {"shared-mem", ARGUMENT_SHM_SHORT, "SHM_LOCATION", 0, "Specify shm locations"},
+    {0}};
+
+// Argument parser function
+static error_t parse_opt(int key, char *arg, struct argp_state *state) {
+    Arguments *arguments = static_cast<Arguments *>(state->input);
+
+    switch (key) {
+        case 'm':
+            arguments->model_path = arg;
+            break;
+        case 'l':
+            arguments->lora_paths.push_back(arg);
+            break;
+        case 's':
+            arguments->state_save = true;
+            break;
+        case 'k':
+            arguments->kv_save = true;
+            break;
+        case 'b':
+            arguments->llama_bench_args = true;
+            break;
+        case 'a':
+            arguments->auto_policy = true;
+            break;
+        case 't':
+            arguments->throughput_limit = atof(arg);
+            break;
+        case 'g':
+            arguments->tokens_to_gen = atoi(arg);
+            break;
+        case 'p':
+            arguments->performance_metrics_file = arg;
+            break;
+        case 'f':
+            arguments->sig_file = arg;
+            break;
+        case 'z':
+            arguments->token_limit = atoi(arg);
+            break;
+        case ARGUMENT_SHM_SHORT:
+            arguments->shm_location = arg;
+            break;
+        case ARGUMENT_THREAD:
+            arguments->n_threads = atoi(arg);
+            break;
+        case ARGP_KEY_END:
+            if (arguments->model_path.empty()) {
+                argp_usage(state);  // Print usage and exit if model_path is missing
+            }
+            break;
+        default:
+            return ARGP_ERR_UNKNOWN;
+    }
+    return 0;
+}
+
+// Argp parser setup
+static struct argp argp = {options, parse_opt, nullptr, doc};
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/include/misc.h b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/include/misc.h
new file mode 100644
index 000000000..5782f5d27
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/include/misc.h
@@ -0,0 +1,5 @@
+#pragma once
+
+inline long round_up(long n, long mult) {
+    return ((n + mult - 1) / mult) * mult;
+}
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/include/shm.h b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/include/shm.h
new file mode 100644
index 000000000..d22896318
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/include/shm.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <pthread.h>
+#include <semaphore.h>
+
+#include <string>
+
+#define SHM_LOCATION "/sys/bus/pci/devices/0000:00:12.0/resource2"
+#define SHM_SIZE (32 * 1024 * 1024)
+#define MAX_REQUESTS 512
+#define MAX_TEXT 8196
+#define NB_PRIORITIES 3
+
+enum Lora { NO_LORA, SQL, FOOD, SIZE };
+
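+// One request cell in the shared-memory queue. A client claims a cell by
+// locking `mutex`, fills in the prompt and signals `serverNotifier`; the
+// server writes the generated text into `text` and signals `clientNotifier`.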
+struct Request {
+    pthread_mutex_t mutex;
+    pthread_mutex_t throughput_mutex;
+    sem_t clientNotifier;
+    sem_t serverNotifier;
+    int id;
+    int n_chars_to_gen;
+    char text[MAX_TEXT];
+    int prio;
+    enum Lora lora;
+    float throughput;
+};
+
+struct SharedMemory {
+    struct Request requests[MAX_REQUESTS];
+    sem_t active_reqs;
+    sem_t run;  // server will run while this is larger than 0
+};
+
+struct Communication {
+    struct SharedMemory* requestQueue;
+};
+
+int init_shm(bool create, Communication& comm, std::string optional_shm_location);
+int clean_shm(struct SharedMemory* shm);
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/client_request.h b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/client_request.h
new file mode 100644
index 000000000..2c1dd1723
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/client_request.h
@@ -0,0 +1,305 @@
+#pragma once
+
+#include "defines.h"
+#include "llama.h"
+#include "shm.h"
+
+class ClientRequest {
+private:
+    // Raw request lying in the shared memory
+    Request *request;
+
+public:
+    std::string prompt;
+    int n_chars_to_gen;
+    int id;
+    int prio;
+    enum Lora lora;
+    float throughput = 999;
+
+    // Following are used to keep the llama.cpp context
+    std::vector<llama_token> prompt_tokens;
+    llama_batch batch;
+    llama_context *ctx;
+    llama_sampler *smpl;
+    int new_token_id;
+    int n_generated_char = 0;
+    int n_past_tokens = 0;
+    uint8_t *state_save = NULL;
+
+    // KV Cache
+    enum Policy policy = general_policy;  // Apply the default policy at start
+    uint8_t *kv_cache_backup = NULL;
+    size_t kv_cache_size = 0;
+
+    bool initialized = false;  // Each new request must be initialized with a new context
+    bool done = false;         // We remove the done requests at the main loop
+    bool require_context_switch =
+        false;  // True if the request is newly scheduled after a different request
+
+    // For debugging purposes
+    int scheduler_counter = 0;
+    llama_kv_cache_view kv_view;
+    int n_past_tokens_mod = 0;
+    bool limit_hit = false;
+
+    ClientRequest(Request *request, std::string prompt, int n_chars_to_gen, int id, int prio,
+                  enum Lora lora)
+        : request(request),
+          prompt(prompt),
+          n_chars_to_gen(n_chars_to_gen),
+          id(id),
+          prio(prio),
+          lora(lora) {
+    }
+
+    Request *get_raw_request() {
+        return request;
+    }
+    int get_current_length() {
+        return strlen(request->text);
+    };
+
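+    // Non-blocking read of the client-updated throughput limit: if the client
+    // currently holds `throughput_mutex`, keep the last value we read.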
+    float get_throughput() {
+        if (pthread_mutex_trylock(&request->throughput_mutex) == 0) {
+            throughput = request->throughput;
+            pthread_mutex_unlock(&request->throughput_mutex);
+        }
+
+        return throughput;
+    }
+
+    friend std::ostream &operator<<(std::ostream &os, const ClientRequest &obj) {
+        os << "Printing request...\n"
+           << "\t- id: " << obj.id << "\n"
+           << "\t- prio: " << obj.prio << "\n"
+           << "\t- lora: " << obj.lora << "\n"
+           << "\t- # of times this request has been scheduled: " << obj.scheduler_counter
+           << "\n"
+           //   << "\t- # of requests from the client: " << client_request_counter[obj.id] <<
+           //   std::endl;
+           << std::endl;
+        return os;
+    }
+
+    void print_kv_cache() {
+        std::cout << "Used cells: " << kv_view.used_cells << "\\" << kv_view.n_cells << std::endl;
+    }
+
+    void apply_lora() {
+        if (lora != NO_LORA) {
+            if ((size_t)lora >= lora_adapters.size() + 1) {
+                std::cout << "Wrong lora index" << std::endl;
+                // We could continue without applying any LoRA here, but we
+                // exit for now so the problem is easy to detect
+                exit(0);
+            } else {
+                llama_lora_adapter_set(ctx, lora_adapters.at(lora - 1), 1);
+            }
+        }
+    }
+
+    void init_from_clean_kv() {
+        std::cout << "\tLoading context without kv cache..." << std::endl;
+        // initialize the context
+        llama_context_params ctx_params = llama_context_default_params();
+        // n_ctx is the context size
+        ctx_params.n_ctx = CTX_SIZE;
+        // n_batch is the maximum number of tokens that can be processed in a
+        // single call to llama_decode
+        ctx_params.n_batch = BATCH_SIZE;
+        // enable performance counters
+        ctx_params.no_perf = false;
+        ctx_params.offload_kqv = true;
+
+        if (!use_llama_bench_args) {
+            ctx = llama_new_context_with_model(model, ctx_params);
+        } else {
+            ctx = llama_new_context_with_model(model, llama_bench_args.to_llama_cparams());
+        }
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
+            exit(1);
+        }
+
+        // initialize the sampler
+
+        auto sparams = llama_sampler_chain_default_params();
+        sparams.no_perf = false;
+        smpl = llama_sampler_chain_init(sparams);
+
+        llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
+        n_past_tokens = 0;
+
+        // prepare the batch
+        batch = llama_batch_init(prompt_tokens.size(), 0, 1);
+        for (size_t i = 0; i < prompt_tokens.size(); i++) {
+            common_batch_add(batch, prompt_tokens[i], i, {0}, false);
+        }
+        batch.logits[batch.n_tokens - 1] = true;  // generate next token
+    }
+
+    void init_from_saved_kv() {
+        std::cout << "\tLoading kv cache..." << std::endl;
+        // initialize the context
+        llama_context_params ctx_params = llama_context_default_params();
+        // n_ctx is the context size
+        ctx_params.n_ctx = CTX_SIZE;
+        // n_batch is the maximum number of tokens that can be processed in a
+        // single call to llama_decode
+        ctx_params.n_batch = BATCH_SIZE;
+        // enable performance counters
+        ctx_params.no_perf = false;
+        ctx_params.offload_kqv = true;
+
+        if (!use_llama_bench_args) {
+            ctx = llama_new_context_with_model(model, ctx_params);
+        } else {
+            ctx = llama_new_context_with_model(model, llama_bench_args.to_llama_cparams());
+        }
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
+            exit(1);
+        }
+
+        // initialize the sampler
+
+        auto sparams = llama_sampler_chain_default_params();
+        sparams.no_perf = false;
+        smpl = llama_sampler_chain_init(sparams);
+
+        llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
+        llama_state_seq_set_data(ctx, kv_cache_backup, kv_cache_size, 0);
+
+        // set_state_data does not restore the batch, therefore we need to reinit it (since we freed
+        // it) and add the last sampled token
+        batch = llama_batch_init(1, 0, 1);
+        common_batch_clear(batch);
+        common_batch_add(batch, new_token_id, n_past_tokens, {0}, true);
+        batch.logits[batch.n_tokens - 1] = true;
+    }
+
+    void init_ctx_from_save_state() {
+        std::cout << "\tLoading saved context..." << std::endl;
+        // initialize the context
+
+        llama_context_params ctx_params = llama_context_default_params();
+        // n_ctx is the context size
+        ctx_params.n_ctx = CTX_SIZE;
+        // n_batch is the maximum number of tokens that can be processed in a
+        // single call to llama_decode
+        ctx_params.n_batch = BATCH_SIZE;
+        // enable performance counters
+        ctx_params.no_perf = false;
+
+        if (!use_llama_bench_args) {
+            ctx = llama_new_context_with_model(model, ctx_params);
+        } else {
+            ctx = llama_new_context_with_model(model, llama_bench_args.to_llama_cparams());
+        }
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
+            exit(1);
+        }
+
+        // initialize the sampler
+
+        auto sparams = llama_sampler_chain_default_params();
+        sparams.no_perf = false;
+        smpl = llama_sampler_chain_init(sparams);
+
+        llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
+        llama_set_state_data(ctx, state_save);
+
+        // set_state_data does not restore the batch, therefore we need to reinit it (since we freed
+        // it) and add the last sampled token
+        batch = llama_batch_init(1, 0, 1);
+        common_batch_clear(batch);
+        common_batch_add(batch, new_token_id, n_past_tokens, {0}, true);
+        batch.logits[batch.n_tokens - 1] = true;
+    }
+
+    void init_llama_context() {
+        // tokenize the prompt
+
+        // find the number of tokens in the prompt
+        const int n_prompt =
+            -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+
+        // allocate space for the tokens and tokenize the prompt
+        prompt_tokens.resize(n_prompt);
+        if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(),
+                           prompt_tokens.size(), true, true) < 0) {
+            fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
+            exit(1);
+        }
+
+        // initialize the context
+
+        llama_context_params ctx_params = llama_context_default_params();
+        // n_ctx is the context size
+        ctx_params.n_ctx = CTX_SIZE;
+        // n_batch is the maximum number of tokens that can be processed in a
+        // single call to llama_decode
+        ctx_params.n_batch = BATCH_SIZE;
+        // enable performance counters
+        ctx_params.no_perf = false;
+
+        if (!use_llama_bench_args) {
+            ctx = llama_new_context_with_model(model, ctx_params);
+        } else {
+            ctx = llama_new_context_with_model(model, llama_bench_args.to_llama_cparams());
+        }
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
+            exit(1);
+        }
+
+        kv_view = llama_kv_cache_view_init(ctx, 1);
+
+        // initialize the sampler
+
+        auto sparams = llama_sampler_chain_default_params();
+        sparams.no_perf = false;
+        smpl = llama_sampler_chain_init(sparams);
+
+        llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
+        // print the prompt token-by-token
+
+        std::cout << "Request text:\n\t";
+        for (auto id : prompt_tokens) {
+            char buf[128];
+            int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true);
+            if (n < 0) {
+                fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+                exit(1);
+            }
+            std::string s(buf, n);
+            std::cout << s;
+        }
+        std::cout << std::endl;
+
+        // prepare the batch
+        batch = llama_batch_init(prompt_tokens.size(), 0, 1);
+        for (size_t i = 0; i < prompt_tokens.size(); i++) {
+            common_batch_add(batch, prompt_tokens[i], i, {0}, true);
+        }
+        batch.logits[batch.n_tokens - 1] = true;  // generate next token
+    }
+
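+    // Switch this request to the full save/load policy once its KV cache
+    // usage reaches KV_CACHE_POLICY_LIMIT cells.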
+    void auto_set_kv_policy() {
+        llama_kv_cache_view_update(ctx, &kv_view);
+        if (policy != Policy::save_load_full && kv_view.used_cells >= KV_CACHE_POLICY_LIMIT) {
+            policy = Policy::save_load_full;
+            std::cout << "Changing KV cache policy to save/load" << std::endl;
+        }
+    }
+};
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/defines.h b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/defines.h
new file mode 100644
index 000000000..30563baa4
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/defines.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <unordered_map>
+#include "llama.h"
+#include "llama_bench_args.h"
+
+#define CTX_SIZE 2048  // Also KV cache size
+#define BATCH_SIZE 2048
+
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+
+#define KV_CACHE_LIMIT 2000
+#define KV_CACHE_POLICY_LIMIT 2000
+
+// Lowest priority: lower values mean higher priority, so "no priority" is
+// represented by a really high number
+#define NO_PRIORITY 1000
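+// Context-switch policies: save/restore the full llama state, save/restore
+// only the KV cache, or recompute the prompt from scratch.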
+enum Policy { save_load_full, save_load_kv, recompute };
+extern Policy general_policy;
+extern std::unordered_map<int, llama_lora_adapter *> lora_adapters;
+extern bool use_llama_bench_args;
+extern llama_model *model;
+extern cmd_params_instance llama_bench_args;
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/llama_bench_args.h b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/llama_bench_args.h
new file mode 100644
index 000000000..e4828fce5
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/llama_bench_args.h
@@ -0,0 +1,796 @@
+#pragma once
+
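+// llama-bench-style command-line parameters and parsing (adapted from
+// llama.cpp's llama-bench tool); used by the server when --llama-bench-args is set.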
+#include <regex>
+#include "llama.h"
+#include "common.h"
+
+template <class T>
+static std::string join(const std::vector<T>& values, const std::string& delim) {
+    std::ostringstream str;
+    for (size_t i = 0; i < values.size(); i++) {
+        str << values[i];
+        if (i < values.size() - 1) {
+            str << delim;
+        }
+    }
+    return str.str();
+}
+
+// command line params
+enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN };
+
+static bool output_format_from_str(const std::string& s, output_formats& format) {
+    if (s == "none") {
+        format = NONE;
+    } else if (s == "csv") {
+        format = CSV;
+    } else if (s == "json") {
+        format = JSON;
+    } else if (s == "jsonl") {
+        format = JSONL;
+    } else if (s == "md") {
+        format = MARKDOWN;
+    } else {
+        return false;
+    }
+    return true;
+}
+
+struct cmd_params {
+    std::vector<std::string> model;
+    std::vector<int> n_prompt;
+    std::vector<int> n_gen;
+    std::vector<std::pair<int, int>> n_pg;
+    std::vector<int> n_batch;
+    std::vector<int> n_ubatch;
+    std::vector<ggml_type> type_k;
+    std::vector<ggml_type> type_v;
+    std::vector<int> n_threads;
+    std::vector<std::string> cpu_mask;
+    std::vector<bool> cpu_strict;
+    std::vector<int> poll;
+    std::vector<int> n_gpu_layers;
+    std::vector<std::string> rpc_servers;
+    std::vector<llama_split_mode> split_mode;
+    std::vector<int> main_gpu;
+    std::vector<bool> no_kv_offload;
+    std::vector<bool> flash_attn;
+    std::vector<std::vector<float>> tensor_split;
+    std::vector<bool> use_mmap;
+    std::vector<bool> embeddings;
+    ggml_numa_strategy numa;
+    int reps;
+    ggml_sched_priority prio;
+    int delay;
+    bool verbose;
+    bool progress;
+    output_formats output_format;
+    output_formats output_format_stderr;
+};
+
+static const cmd_params cmd_params_defaults = {
+    /* model                */ {"models/7B/ggml-model-q4_0.gguf"},
+    /* n_prompt             */ {512},
+    /* n_gen                */ {128},
+    /* n_pg                 */ {},
+    /* n_batch              */ {2048},
+    /* n_ubatch             */ {512},
+    /* type_k               */ {GGML_TYPE_F16},
+    /* type_v               */ {GGML_TYPE_F16},
+    /* n_threads            */ {cpu_get_num_math()},
+    /* cpu_mask             */ {"0x0"},
+    /* cpu_strict           */ {false},
+    /* poll                 */ {50},
+    /* n_gpu_layers         */ {99},
+    /* rpc_servers          */ {""},
+    /* split_mode           */ {LLAMA_SPLIT_MODE_LAYER},
+    /* main_gpu             */ {0},
+    /* no_kv_offload        */ {false},
+    /* flash_attn           */ {false},
+    /* tensor_split         */ {std::vector<float>(llama_max_devices(), 0.0f)},
+    /* use_mmap             */ {true},
+    /* embeddings           */ {false},
+    /* numa                 */ GGML_NUMA_STRATEGY_DISABLED,
+    /* reps                 */ 5,
+    /* prio                 */ GGML_SCHED_PRIO_NORMAL,
+    /* delay                */ 0,
+    /* verbose              */ false,
+    /* progress             */ false,
+    /* output_format        */ MARKDOWN,
+    /* output_format_stderr */ NONE,
+};
+
+static ggml_type ggml_type_from_name(const std::string& s) {
+    if (s == "f16") {
+        return GGML_TYPE_F16;
+    }
+    if (s == "q8_0") {
+        return GGML_TYPE_Q8_0;
+    }
+    if (s == "q4_0") {
+        return GGML_TYPE_Q4_0;
+    }
+    if (s == "q4_1") {
+        return GGML_TYPE_Q4_1;
+    }
+    if (s == "q5_0") {
+        return GGML_TYPE_Q5_0;
+    }
+    if (s == "q5_1") {
+        return GGML_TYPE_Q5_1;
+    }
+    if (s == "iq4_nl") {
+        return GGML_TYPE_IQ4_NL;
+    }
+
+    return GGML_TYPE_COUNT;
+}
+
+static cmd_params parse_cmd_params(int argc, char** argv) {
+    cmd_params params;
+    std::string arg;
+    bool invalid_param = false;
+    const std::string arg_prefix = "--";
+    const char split_delim = ',';
+
+    params.verbose = cmd_params_defaults.verbose;
+    params.output_format = cmd_params_defaults.output_format;
+    params.output_format_stderr = cmd_params_defaults.output_format_stderr;
+    params.reps = cmd_params_defaults.reps;
+    params.numa = cmd_params_defaults.numa;
+    params.prio = cmd_params_defaults.prio;
+    params.delay = cmd_params_defaults.delay;
+    params.progress = cmd_params_defaults.progress;
+
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+
+        if (arg == "-h" || arg == "--help") {
+            exit(0);
+        } else if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<std::string>(argv[i], split_delim);
+            params.model.insert(params.model.end(), p.begin(), p.end());
+        } else if (arg == "-p" || arg == "--n-prompt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<int>(argv[i], split_delim);
+            params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
+        } else if (arg == "-n" || arg == "--n-gen") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<int>(argv[i], split_delim);
+            params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
+        } else if (arg == "-pg") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<std::string>(argv[i], ',');
+            if (p.size() != 2) {
+                invalid_param = true;
+                break;
+            }
+            params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
+        } else if (arg == "-b" || arg == "--batch-size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<int>(argv[i], split_delim);
+            params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
+        } else if (arg == "-ub" || arg == "--ubatch-size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<int>(argv[i], split_delim);
+            params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
+        } else if (arg == "-ctk" || arg == "--cache-type-k") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<std::string>(argv[i], split_delim);
+            std::vector<ggml_type> types;
+            for (const auto& t : p) {
+                ggml_type gt = ggml_type_from_name(t);
+                if (gt == GGML_TYPE_COUNT) {
+                    invalid_param = true;
+                    break;
+                }
+                types.push_back(gt);
+            }
+            if (invalid_param) {
+                break;
+            }
+            params.type_k.insert(params.type_k.end(), types.begin(), types.end());
+        } else if (arg == "-ctv" || arg == "--cache-type-v") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<std::string>(argv[i], split_delim);
+            std::vector<ggml_type> types;
+            for (const auto& t : p) {
+                ggml_type gt = ggml_type_from_name(t);
+                if (gt == GGML_TYPE_COUNT) {
+                    invalid_param = true;
+                    break;
+                }
+                types.push_back(gt);
+            }
+            if (invalid_param) {
+                break;
+            }
+            params.type_v.insert(params.type_v.end(), types.begin(), types.end());
+        } else if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<int>(argv[i], split_delim);
+            params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
+        } else if (arg == "-C" || arg == "--cpu-mask") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<std::string>(argv[i], split_delim);
+            params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
+        } else if (arg == "--cpu-strict") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
+        } else if (arg == "--poll") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<int>(argv[i], split_delim);
+            params.poll.insert(params.poll.end(), p.begin(), p.end());
+        } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<int>(argv[i], split_delim);
+            params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
+        } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rpc_servers.push_back(argv[i]);
+        } else if (arg == "-sm" || arg == "--split-mode") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<std::string>(argv[i], split_delim);
+            std::vector<llama_split_mode> modes;
+            for (const auto& m : p) {
+                llama_split_mode mode;
+                if (m == "none") {
+                    mode = LLAMA_SPLIT_MODE_NONE;
+                } else if (m == "layer") {
+                    mode = LLAMA_SPLIT_MODE_LAYER;
+                } else if (m == "row") {
+                    mode = LLAMA_SPLIT_MODE_ROW;
+                } else {
+                    invalid_param = true;
+                    break;
+                }
+                modes.push_back(mode);
+            }
+            if (invalid_param) {
+                break;
+            }
+            params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
+        } else if (arg == "-mg" || arg == "--main-gpu") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.main_gpu = string_split<int>(argv[i], split_delim);
+        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
+        } else if (arg == "--numa") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            } else {
+                std::string value(argv[i]);
+                /**/ if (value == "distribute" || value == "") {
+                    params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE;
+                } else if (value == "isolate") {
+                    params.numa = GGML_NUMA_STRATEGY_ISOLATE;
+                } else if (value == "numactl") {
+                    params.numa = GGML_NUMA_STRATEGY_NUMACTL;
+                } else {
+                    invalid_param = true;
+                    break;
+                }
+            }
+        } else if (arg == "-fa" || arg == "--flash-attn") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
+        } else if (arg == "-mmp" || arg == "--mmap") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
+        } else if (arg == "-embd" || arg == "--embeddings") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
+        } else if (arg == "-ts" || arg == "--tensor-split") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            for (auto ts : string_split<std::string>(argv[i], split_delim)) {
+                // split string by ; and /
+                const std::regex regex{R"([;/]+)"};
+                std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
+                std::vector<std::string> split_arg{it, {}};
+                GGML_ASSERT(split_arg.size() <= llama_max_devices());
+
+                std::vector<float> tensor_split(llama_max_devices());
+                for (size_t i = 0; i < llama_max_devices(); ++i) {
+                    if (i < split_arg.size()) {
+                        tensor_split[i] = std::stof(split_arg[i]);
+                    } else {
+                        tensor_split[i] = 0.0f;
+                    }
+                }
+                params.tensor_split.push_back(tensor_split);
+            }
+        } else if (arg == "-r" || arg == "--repetitions") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.reps = std::stoi(argv[i]);
+        } else if (arg == "--prio") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.prio = (enum ggml_sched_priority)std::stoi(argv[i]);
+        } else if (arg == "--delay") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.delay = std::stoi(argv[i]);
+        } else if (arg == "-o" || arg == "--output") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            invalid_param = !output_format_from_str(argv[i], params.output_format);
+        } else if (arg == "-oe" || arg == "--output-err") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
+        } else if (arg == "-v" || arg == "--verbose") {
+            params.verbose = true;
+        } else if (arg == "--progress") {
+            params.progress = true;
+        } else {
+            invalid_param = true;
+            break;
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+
+        exit(1);
+    }
+
+    // set defaults
+    if (params.model.empty()) {
+        params.model = cmd_params_defaults.model;
+    }
+    if (params.n_prompt.empty()) {
+        params.n_prompt = cmd_params_defaults.n_prompt;
+    }
+    if (params.n_gen.empty()) {
+        params.n_gen = cmd_params_defaults.n_gen;
+    }
+    if (params.n_pg.empty()) {
+        params.n_pg = cmd_params_defaults.n_pg;
+    }
+    if (params.n_batch.empty()) {
+        params.n_batch = cmd_params_defaults.n_batch;
+    }
+    if (params.n_ubatch.empty()) {
+        params.n_ubatch = cmd_params_defaults.n_ubatch;
+    }
+    if (params.type_k.empty()) {
+        params.type_k = cmd_params_defaults.type_k;
+    }
+    if (params.type_v.empty()) {
+        params.type_v = cmd_params_defaults.type_v;
+    }
+    if (params.n_gpu_layers.empty()) {
+        params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
+    }
+    if (params.rpc_servers.empty()) {
+        params.rpc_servers = cmd_params_defaults.rpc_servers;
+    }
+    if (params.split_mode.empty()) {
+        params.split_mode = cmd_params_defaults.split_mode;
+    }
+    if (params.main_gpu.empty()) {
+        params.main_gpu = cmd_params_defaults.main_gpu;
+    }
+    if (params.no_kv_offload.empty()) {
+        params.no_kv_offload = cmd_params_defaults.no_kv_offload;
+    }
+    if (params.flash_attn.empty()) {
+        params.flash_attn = cmd_params_defaults.flash_attn;
+    }
+    if (params.tensor_split.empty()) {
+        params.tensor_split = cmd_params_defaults.tensor_split;
+    }
+    if (params.use_mmap.empty()) {
+        params.use_mmap = cmd_params_defaults.use_mmap;
+    }
+    if (params.embeddings.empty()) {
+        params.embeddings = cmd_params_defaults.embeddings;
+    }
+    if (params.n_threads.empty()) {
+        params.n_threads = cmd_params_defaults.n_threads;
+    }
+    if (params.cpu_mask.empty()) {
+        params.cpu_mask = cmd_params_defaults.cpu_mask;
+    }
+    if (params.cpu_strict.empty()) {
+        params.cpu_strict = cmd_params_defaults.cpu_strict;
+    }
+    if (params.poll.empty()) {
+        params.poll = cmd_params_defaults.poll;
+    }
+
+    return params;
+}
+
+struct cmd_params_instance {
+    std::string model;
+    int n_prompt;
+    int n_gen;
+    int n_batch;
+    int n_ubatch;
+    ggml_type type_k;
+    ggml_type type_v;
+    int n_threads;
+    std::string cpu_mask;
+    bool cpu_strict;
+    int poll;
+    int n_gpu_layers;
+    std::string rpc_servers;
+    llama_split_mode split_mode;
+    int main_gpu;
+    bool no_kv_offload;
+    bool flash_attn;
+    std::vector<float> tensor_split;
+    bool use_mmap;
+    bool embeddings;
+
+    llama_model_params to_llama_mparams() const {
+        llama_model_params mparams = llama_model_default_params();
+
+        mparams.n_gpu_layers = n_gpu_layers;
+        if (!rpc_servers.empty()) {
+            mparams.rpc_servers = rpc_servers.c_str();
+        }
+        mparams.split_mode = split_mode;
+        mparams.main_gpu = main_gpu;
+        mparams.tensor_split = tensor_split.data();
+        mparams.use_mmap = use_mmap;
+
+        return mparams;
+    }
+
+    bool equal_mparams(const cmd_params_instance& other) const {
+        return model == other.model && n_gpu_layers == other.n_gpu_layers &&
+               rpc_servers == other.rpc_servers && split_mode == other.split_mode &&
+               main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
+               tensor_split == other.tensor_split;
+    }
+
+    llama_context_params to_llama_cparams() const {
+        llama_context_params cparams = llama_context_default_params();
+
+        cparams.n_ctx = n_prompt + n_gen;
+        cparams.n_batch = n_batch;
+        cparams.n_ubatch = n_ubatch;
+        cparams.type_k = type_k;
+        cparams.type_v = type_v;
+        cparams.offload_kqv = !no_kv_offload;
+        cparams.flash_attn = flash_attn;
+        cparams.embeddings = embeddings;
+
+        return cparams;
+    }
+};
+
+static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params& params) {
+    std::vector<cmd_params_instance> instances;
+
+    // this ordering minimizes the number of times that each model needs to be reloaded
+    for (const auto& m : params.model)
+        for (const auto& nl : params.n_gpu_layers)
+            for (const auto& rpc : params.rpc_servers)
+                for (const auto& sm : params.split_mode)
+                    for (const auto& mg : params.main_gpu)
+                        for (const auto& ts : params.tensor_split)
+                            for (const auto& mmp : params.use_mmap)
+                                for (const auto& embd : params.embeddings)
+                                    for (const auto& nb : params.n_batch)
+                                        for (const auto& nub : params.n_ubatch)
+                                            for (const auto& tk : params.type_k)
+                                                for (const auto& tv : params.type_v)
+                                                    for (const auto& nkvo : params.no_kv_offload)
+                                                        for (const auto& fa : params.flash_attn)
+                                                            for (const auto& nt : params.n_threads)
+                                                                for (const auto& cm :
+                                                                     params.cpu_mask)
+                                                                    for (const auto& cs :
+                                                                         params.cpu_strict)
+                                                                        for (const auto& pl :
+                                                                             params.poll) {
+                                                                            for (const auto&
+                                                                                     n_prompt :
+                                                                                 params.n_prompt) {
+                                                                                if (n_prompt == 0) {
+                                                                                    continue;
+                                                                                }
+                                                                                cmd_params_instance
+                                                                                    instance = {
+                                                                                        /* .model =
+                                                                                         */
+                                                                                        m,
+                                                                                        /* .n_prompt
+                                                                                           = */
+                                                                                        n_prompt,
+                                                                                        /* .n_gen =
+                                                                                         */
+                                                                                        0,
+                                                                                        /* .n_batch
+                                                                                           = */
+                                                                                        nb,
+                                                                                        /* .n_ubatch
+                                                                                           = */
+                                                                                        nub,
+                                                                                        /* .type_k
+                                                                                           = */
+                                                                                        tk,
+                                                                                        /* .type_v
+                                                                                           = */
+                                                                                        tv,
+                                                                                        /* .n_threads
+                                                                                           = */
+                                                                                        nt,
+                                                                                        /* .cpu_mask
+                                                                                           = */
+                                                                                        cm,
+                                                                                        /* .cpu_strict
+                                                                                           = */
+                                                                                        cs,
+                                                                                        /* .poll =
+                                                                                         */
+                                                                                        pl,
+                                                                                        /* .n_gpu_layers
+                                                                                           = */
+                                                                                        nl,
+                                                                                        /* .rpc_servers
+                                                                                           = */
+                                                                                        rpc,
+                                                                                        /* .split_mode
+                                                                                           = */
+                                                                                        sm,
+                                                                                        /* .main_gpu
+                                                                                           = */
+                                                                                        mg,
+                                                                                        /* .no_kv_offload=
+                                                                                         */
+                                                                                        nkvo,
+                                                                                        /* .flash_attn
+                                                                                           = */
+                                                                                        fa,
+                                                                                        /* .tensor_split
+                                                                                           = */
+                                                                                        ts,
+                                                                                        /* .use_mmap
+                                                                                           = */
+                                                                                        mmp,
+                                                                                        /* .embeddings
+                                                                                           = */
+                                                                                        embd,
+                                                                                    };
+                                                                                instances.push_back(
+                                                                                    instance);
+                                                                            }
+
+                                                                            for (const auto& n_gen : params.n_gen) {
+                                                                                if (n_gen == 0) {
+                                                                                    continue;
+                                                                                }
+                                                                                cmd_params_instance instance = {
+                                                                                    /* .model        = */ m,
+                                                                                    /* .n_prompt     = */ 0,
+                                                                                    /* .n_gen        = */ n_gen,
+                                                                                    /* .n_batch      = */ nb,
+                                                                                    /* .n_ubatch     = */ nub,
+                                                                                    /* .type_k       = */ tk,
+                                                                                    /* .type_v       = */ tv,
+                                                                                    /* .n_threads    = */ nt,
+                                                                                    /* .cpu_mask     = */ cm,
+                                                                                    /* .cpu_strict   = */ cs,
+                                                                                    /* .poll         = */ pl,
+                                                                                    /* .n_gpu_layers = */ nl,
+                                                                                    /* .rpc_servers  = */ rpc,
+                                                                                    /* .split_mode   = */ sm,
+                                                                                    /* .main_gpu     = */ mg,
+                                                                                    /* .no_kv_offload= */ nkvo,
+                                                                                    /* .flash_attn   = */ fa,
+                                                                                    /* .tensor_split = */ ts,
+                                                                                    /* .use_mmap     = */ mmp,
+                                                                                    /* .embeddings   = */ embd,
+                                                                                };
+                                                                                instances.push_back(instance);
+                                                                            }
+
+                                                                            for (const auto& n_pg : params.n_pg) {
+                                                                                if (n_pg.first == 0 && n_pg.second == 0) {
+                                                                                    continue;
+                                                                                }
+                                                                                cmd_params_instance instance = {
+                                                                                    /* .model        = */ m,
+                                                                                    /* .n_prompt     = */ n_pg.first,
+                                                                                    /* .n_gen        = */ n_pg.second,
+                                                                                    /* .n_batch      = */ nb,
+                                                                                    /* .n_ubatch     = */ nub,
+                                                                                    /* .type_k       = */ tk,
+                                                                                    /* .type_v       = */ tv,
+                                                                                    /* .n_threads    = */ nt,
+                                                                                    /* .cpu_mask     = */ cm,
+                                                                                    /* .cpu_strict   = */ cs,
+                                                                                    /* .poll         = */ pl,
+                                                                                    /* .n_gpu_layers = */ nl,
+                                                                                    /* .rpc_servers  = */ rpc,
+                                                                                    /* .split_mode   = */ sm,
+                                                                                    /* .main_gpu     = */ mg,
+                                                                                    /* .no_kv_offload= */ nkvo,
+                                                                                    /* .flash_attn   = */ fa,
+                                                                                    /* .tensor_split = */ ts,
+                                                                                    /* .use_mmap     = */ mmp,
+                                                                                    /* .embeddings   = */ embd,
+                                                                                };
+                                                                                instances.push_back(instance);
+                                                                            }
+                                                                        }
+
+    return instances;
+}
\ No newline at end of file
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/llama_forwarder.c b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/llama_forwarder.c
new file mode 100644
index 000000000..b8f5e6b9a
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/llama_forwarder.c
@@ -0,0 +1,171 @@
+// llama_forwarder.c: reads prompts from the inter-VM shared-memory request
+// queue and forwards them as HTTP POSTs to a local llama.cpp server, writing
+// the responses back into the result queue.
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <assert.h>
+#include <curl/curl.h>
+#include <string.h>
+#include "shm.h"
+
+#define SHM_SIZE (32 * 1024 * 1024)
+
+#define MIN(a,b) (((a)<(b))?(a):(b))
+#define MAX(a,b) (((a)>(b))?(a):(b))
+
+struct Communication comm;
+
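+/* libcurl write callback: libcurl may invoke this several times for a single
+ * response; each call overwrites result->text from the start, so only the
+ * first 512 bytes of the most recent chunk are kept. */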
+static size_t
+WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp)
+{
+  size_t realsize = size * nmemb;
+  struct Request *result = (struct Request *)userp;
+ 
+  memcpy(result->text, contents, MIN(realsize, 512));
+ 
+  return realsize;
+}
+
+static long round_up(long n, long mult)
+{
+    return ((n + mult - 1) / mult) * mult;
+}
+
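+/* Map the ivshmem PCI BARs of the two inter-VM shared-memory devices:
+ * 0000:00:10.0 backs the request queue and 0000:00:11.0 the result queue.
+ * Both regions are zeroed and their process-shared semaphores (re)initialized. */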
+int init_shm(){
+    int fd = open("/sys/bus/pci/devices/0000:00:10.0/resource2", O_RDWR);
+    assert(fd != -1);
+    
+    long pagesize = sysconf(_SC_PAGESIZE);
+    long shm_size = round_up(sizeof(struct SharedMemory), pagesize);
+    if (ftruncate(fd, shm_size) == -1) {
+        perror("ftruncate");
+        return -1;
+    }
+
+    comm.requestQueue = mmap(0, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    assert(comm.requestQueue != NULL);
+    assert(comm.requestQueue != MAP_FAILED);
+
+    sem_destroy(&comm.requestQueue->semaphore);
+    close(fd);
+    
+    memset(comm.requestQueue, 0, shm_size);
+    if (sem_init(&comm.requestQueue->semaphore, 1, 0) != 0) {
+        perror("sem_init failed");
+        exit(EXIT_FAILURE);
+    }
+    
+    fd = open("/sys/bus/pci/devices/0000:00:11.0/resource2", O_RDWR);
+    assert(fd != -1);
+    
+    pagesize = sysconf(_SC_PAGESIZE);
+    shm_size = round_up(sizeof(struct SharedMemory), pagesize);
+    if (ftruncate(fd, shm_size) == -1) {
+        perror("ftruncate");
+        return -1;
+    }
+
+    comm.resultQueue = mmap(0, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    assert(comm.resultQueue != NULL);
+    assert(comm.resultQueue != MAP_FAILED);
+
+    sem_destroy(&comm.resultQueue->semaphore);
+    close(fd);
+    
+    memset(comm.resultQueue, 0, shm_size);
+    if (sem_init(&comm.resultQueue->semaphore, 1, 0) != 0) {
+        perror("sem_init failed");
+        exit(EXIT_FAILURE);
+    }
+    return 0;
+}
+
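+/* Forward the prompt to the local llama.cpp HTTP server. The body POSTed to
+ * /completion looks like
+ *     {"prompt": "<request text>", "n_predict": 10}
+ * and the response body is captured by WriteMemoryCallback into `result`. */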
+int make_post_call(struct Request* request, struct Request* result){
+    CURL *curl;
+    CURLcode res;
+
+    /* In Windows, this inits the Winsock stuff */
+    curl_global_init(CURL_GLOBAL_ALL);
+
+    /* get a curl handle */
+    curl = curl_easy_init();
+    if(curl) {
+        /* First set the URL that is about to receive our POST. This URL can
+         just as well be an https:// URL if that is what should receive the
+         data. */
+        curl_easy_setopt(curl, CURLOPT_URL, "http://localhost:8080/completion");
+        // Set the POST request method
+        curl_easy_setopt(curl, CURLOPT_POST, 1L);
+
+        // Set the HTTP headers
+        struct curl_slist *headers = NULL;
+        headers = curl_slist_append(headers, "Content-Type: application/json");
+        curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
+
+        // Set the POST data
+        char *data = malloc(1024 * sizeof(char));
+        snprintf(data, 1024, "{\"prompt\": \"%s\",\"n_predict\": %d}", request->text, 10);
+        curl_easy_setopt(curl, CURLOPT_POSTFIELDS, data);
+
+        /* send all data to this function  */
+        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
+ 
+        /* we pass our 'chunk' struct to the callback function */
+        curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)result);
+
+        /* Perform the request, res gets the return code */
+        res = curl_easy_perform(curl);
+        /* Check for errors */
+        if(res != CURLE_OK)
+          fprintf(stderr, "curl_easy_perform() failed: %s\n",
+                curl_easy_strerror(res));
+
+        /* always cleanup */
+        curl_slist_free_all(headers);
+        free(data);
+        curl_easy_cleanup(curl);
+    }
+    curl_global_cleanup();
+    return 0;
+}
+
+int handle_request(struct Request* request, struct Request* result){
+    printf("%s\n", request->text);
+    fflush(stdout);
+    
+    make_post_call(request, result);
+    //request->done = 1;
+    return 0;
+}
+
+int write_result(struct Request* result){
+    memcpy(comm.resultQueue->requests[comm.requestQueue->counter].text, result->text, 512);
+    sem_post(&comm.resultQueue->semaphore);
+    return 0;
+}
+
+int clean_shm(struct SharedMemory* shm){
+    munmap(shm, SHM_SIZE);
+    return 0;
+}
+
+int main(int argc, char **argv) {
+    init_shm();
+
+    struct Request* result = malloc(sizeof(struct Request));
+    while (1) {
+        // Poll until a client writes a new request into the queue
+        if (sem_trywait(&comm.requestQueue->semaphore) == 0) {
+            printf("Received new request:\n\t%s\n", comm.requestQueue->requests[comm.requestQueue->counter].text);
+            struct Request *request = &comm.requestQueue->requests[comm.requestQueue->counter];
+            handle_request(request, result);
+            write_result(result);
+        }
+    }
+
+    free(result);
+    return 0;
+}
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/llama_server.cpp b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/llama_server.cpp
new file mode 100644
index 000000000..f5e0ed056
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/server/llama_server.cpp
@@ -0,0 +1,647 @@
+#include <argp.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <chrono>
+#include <cstddef>
+#include <cstdio>
+#include <fstream>
+#include <iostream>
+#include <list>
+#include <mutex>
+#include <stdexcept>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+#include "args_server.h"
+#include "client_request.h"
+#include "common.h"
+#include "defines.h"
+#include "llama.h"
+#include "llama_bench_args.h"
+#include "misc.h"
+#include "shm.h"
+
+#ifdef DEBUG_FLAG
+#define LOG_MSG(...) fprintf(__VA_ARGS__)   // called as LOG_MSG(stdout, "fmt", ...)
+#define LOG_FLUSH(...) fflush(__VA_ARGS__)  // called as LOG_FLUSH(stdout)
+#else
+#define LOG_MSG(...)                     // Or LOG_MSG(msg)
+#define LOG_FLUSH(...)
+#endif
+
+int run = 1;
+unsigned int n_threads = 1;
+
+struct Communication comm;
+
+llama_model *model;
+std::unordered_map<int, llama_lora_adapter *> lora_adapters;
+
+int last_processed_request_id = -1;
+int last_processed_request_prio = -1;
+bool context_change_required;
+
+int curr_priority;
+std::atomic<int> highest_avail_prio(
+    NO_PRIORITY);  // will receive a proper priority on the first scheduled request
+
+enum Policy general_policy;
+bool auto_policy = false;
+
+struct ggml_threadpool *threadpool;
+struct ggml_threadpool *threadpool_batch;
+
+bool use_llama_bench_args = false;
+cmd_params_instance llama_bench_args;
+unsigned int server_total_nb_tokens_generated;
+std::chrono::milliseconds total_context_switch_duration;
+unsigned int nb_context_switches = 0;
+std::chrono::milliseconds total_context_creation_duration;
+unsigned int nb_context_creations = 0;
+
+// Holds number of requests by their id (for debugging)
+std::unordered_map<int, int> client_request_counter;
+
+int round_robin_next[NB_PRIORITIES] = {0};
+
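+// Open requests are kept in one list per priority level (lower number means
+// higher priority). schedule_next() serves the highest non-empty priority and
+// round-robins within it using round_robin_next.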
+struct request_entry {
+    int id;
+    ClientRequest request;
+};
+
+std::array<std::list<request_entry>, NB_PRIORITIES> open_reqs;
+std::array<std::mutex, NB_PRIORITIES> open_reqs_mutex;
+
+int prepare_llama_cpp(struct Arguments arguments) {
+    // initialize the model
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = 999;
+
+    if (!use_llama_bench_args) {
+        model = llama_load_model_from_file(arguments.model_path.c_str(), model_params);
+    } else {
+        std::cout << "Using llama-bench arguments; the model specification given there is ignored"
+                  << std::endl;
+        model = llama_load_model_from_file(arguments.model_path.c_str(),
+                                           llama_bench_args.to_llama_mparams());
+    }
+
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        return 1;
+    }
+
+    // Load lora adapters, they are loaded but not applied.
+    int index = 0;
+    for (const auto &lora_path : arguments.lora_paths) {
+        struct llama_lora_adapter *lora = llama_lora_adapter_init(model, lora_path.c_str());
+        lora_adapters.insert({index++, lora});
+    }
+
+    return 0;
+}
+
+int clean_llama_cpp(void) {
+    llama_free_model(model);
+    return 0;
+}
+
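+// Serve one scheduling slot of a request: restore or create its llama context
+// according to its KV policy, decode and sample up to tokens_to_gen tokens while
+// honouring the throughput limit, and stop early on end-of-generation, on
+// completion, or when a higher-priority request becomes available.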
+int request_processor(ClientRequest &request, int tokens_to_gen, float throughput_limit) {
+    throughput_limit = MIN(throughput_limit, request.get_throughput());
+    // init context if it has not already
+    if (request.require_context_switch && request.initialized) {
+        LOG_MSG(stdout, "Loading context to serve a different request...");
+        LOG_FLUSH(stdout);
+
+        auto t1 = std::chrono::high_resolution_clock::now();
+        switch (request.policy) {
+            case Policy::save_load_full:
+                request.init_ctx_from_save_state();
+                break;
+            case Policy::save_load_kv:
+                request.init_from_saved_kv();
+                break;
+            case Policy::recompute:
+                request.init_from_clean_kv();
+                break;
+            default:
+                break;
+        };
+
+        auto t2 = std::chrono::high_resolution_clock::now();
+        auto duration = duration_cast<std::chrono::milliseconds>(t2 - t1);
+        LOG_MSG(stdout, "%s: Context switch in %ld ms\n", __func__, duration.count());
+
+        total_context_switch_duration += duration;
+        nb_context_switches++;
+    } else if (!request.initialized) {
+        LOG_MSG(stdout, "Creating new context to serve a different request...");
+        LOG_FLUSH(stdout);
+
+        auto t1 = std::chrono::high_resolution_clock::now();
+        request.init_llama_context();
+        request.initialized = true;
+
+        auto t2 = std::chrono::high_resolution_clock::now();
+
+        auto duration = duration_cast<std::chrono::milliseconds>(t2 - t1);
+        LOG_MSG(stdout, "%s: Context creation in %ld ms\n", __func__, duration.count());
+        LOG_FLUSH(stdout);
+
+        total_context_creation_duration += duration;
+        nb_context_creations++;
+    }
+
+    // Apply requests lora, does nothing if no lora specified
+    request.apply_lora();
+
+    llama_kv_cache_view_update(request.ctx, &request.kv_view);
+    LOG_MSG(stdout, "KV Cache right after the switch:\n\t");
+#ifdef DEBUG_FLAG
+    request.print_kv_cache();
+#endif
+    std::cout << std::endl;
+    request.require_context_switch = false;
+
+    LOG_MSG(stdout, "[TEO_DEBUG]: Llama context initialized\n");
+
+    const auto t_main_start = ggml_time_us();
+    int n_decode = 0;
+    int n_generated_tokens = 0;
+
+    auto t_per_token_start = ggml_time_us();
+    auto t_per_token_end = t_per_token_start;
+
+    llama_attach_threadpool(request.ctx, threadpool, threadpool_batch);
+    llama_set_n_threads(request.ctx, n_threads, n_threads);
+
+    while (n_generated_tokens < tokens_to_gen && run) {
+        // evaluate the current batch with the transformer model
+        int result_decode = llama_decode(request.ctx, request.batch);
+        if (result_decode != 0) {
+            if (result_decode == 1) {
+                std::cout << "KV cache full! Stopping request..." << std::endl;
+                request.done = true;
+                request.initialized = false;
+                return 0;
+            }
+            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, result_decode);
+            exit(1);
+        }
+
+        request.n_past_tokens += request.batch.n_tokens;
+        request.n_past_tokens_mod += request.batch.n_tokens;
+        n_decode += request.batch.n_tokens;
+
+        server_total_nb_tokens_generated++;  // For metrics
+
+        // sample the next token
+        {
+            request.new_token_id = llama_sampler_sample(request.smpl, request.ctx, -1);
+            Request *raw_request = request.get_raw_request();
+
+            // is it an end of generation?
+            if (llama_token_is_eog(model, request.new_token_id)) {
+                request.done = true;
+                request.initialized = false;
+                std::cout << "[Info] End-of-generation token produced; fewer characters were "
+                             "generated than the client requested\n"
+                          << "Request has been fully processed!" << std::endl;
+                break;
+            }
+
+            char buf[128];
+            int n = llama_token_to_piece(model, request.new_token_id, buf, sizeof(buf), 0, true);
+            if (n < 0) {
+                fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+                return 1;
+            }
+
+            // Write to shm
+            int cur_length = request.get_current_length();
+            for (int a = 0; a < n && request.n_generated_char + a <= request.n_chars_to_gen; a++) {
+                raw_request->text[cur_length + a] = buf[a];
+            }
+
+            std::string s(buf, n);
+            LOG_MSG(stdout, "Generated in this slot:\n  %s", s.c_str());
+            LOG_FLUSH(stdout);
+
+            request.n_generated_char += n;
+
+            if (request.n_generated_char >= request.n_chars_to_gen) {
+                request.done = true;
+                request.initialized = false;
+                std::cout << "Request has been fully processed!" << std::endl;
+            }
+
+            // TODO: Do these make sense after batch is done or should we move those up?
+            // prepare the next batch with the sampled token
+            common_batch_clear(request.batch);
+            common_batch_add(request.batch, request.new_token_id, request.n_past_tokens, {0}, true);
+
+            // Save generated tokens so that we can recover after a KV cache flush
+            request.prompt_tokens.push_back(request.new_token_id);
+
+#ifdef DEBUG_FLAG
+            // Update kv_cache view (debug)
+            llama_kv_cache_view_update(request.ctx, &request.kv_view);
+            request.print_kv_cache();
+#endif
+        }
+
+        t_per_token_end = ggml_time_us();
+        float token_throughput = 1 / ((t_per_token_end - t_per_token_start) / 1000000.0f);
+        LOG_MSG(stdout, "Token throughput: %f\n", token_throughput);
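+        // Throttle: if this token was produced faster than the per-token budget
+        // of 1 / throughput_limit seconds (e.g. 100 000 us at 10 t/s), sleep for
+        // the remainder of that budget.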
+        if (token_throughput > throughput_limit) {
+            auto t_main_end_new = (1 * 1000000.0 / throughput_limit) + t_per_token_start;
+            int sleep_time_us = t_main_end_new - ggml_time_us();
+            LOG_MSG(stdout, "Sleeping for %d us\n", sleep_time_us);
+            usleep(sleep_time_us);
+        }
+        t_per_token_end = ggml_time_us();
+        token_throughput = 1 / ((t_per_token_end - t_per_token_start) / 1000000.0f);
+        LOG_MSG(stdout, "New Token throughput: %f\n", token_throughput);
+        t_per_token_start = t_per_token_end;
+
+        n_generated_tokens++;
+        if (request.n_generated_char >= request.n_chars_to_gen) {
+            break;
+        }
+
+        // lower numbers are higher priority
+        if (highest_avail_prio.load() < curr_priority) {
+            printf("Higher priority request detected! Preempting current one!\n");
+            break;
+        }
+    }
+
+    LOG_MSG(stdout, "\n");
+    LOG_MSG(stdout, "n_decode: %d\n", n_decode);
+    auto t_main_end = ggml_time_us();
+    float throughput = n_decode / ((t_main_end - t_main_start) / 1000000.0f);
+
+    LOG_MSG(stdout, "Throughput: %f\n", throughput);
+    LOG_MSG(stderr, "%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode,
+            (t_main_end - t_main_start) / 1000000.0f, throughput);
+
+    // Temp stop point for benchmarks
+    if (request.limit_hit == false && request.n_past_tokens_mod > KV_CACHE_LIMIT) {
+        request.limit_hit = true;
+        request.n_past_tokens_mod = 0;
+        std::cout << "KV_CACHE_LIMIT hit, sleeping for 5 seconds" << std::endl;
+        // sleep(5);
+    }
+
+    if (auto_policy) {
+        request.auto_set_kv_policy();
+    }
+
+#ifdef DEBUG_FLAG
+    llama_perf_sampler_print(request.smpl);
+    llama_perf_context_print(request.ctx);
+    fprintf(stderr, "\n");
+#endif
+    return 0;
+}
+
+int handle_request(ClientRequest &request, int tokens_to_gen, float throughput_limit) {
+    request_processor(request, tokens_to_gen, throughput_limit);
+    LOG_MSG(stdout, "[TEO_DEBUG]: Finished processing request\n");
+    return 0;
+}
+
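+// Poll the shared 'run' semaphore; a value of 0 signals shutdown and makes the
+// worker loops exit.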
+void scan_control(void) {
+    while (1) {
+        // Get the current semaphore value
+        if (sem_getvalue(&comm.requestQueue->run, &run) == -1) {
+            perror("sem_getvalue failed");
+            exit(EXIT_FAILURE);
+        }
+
+        if (run == 0) {
+            break;
+        }
+    }
+}
+
+void scan_requests(void) {
+    // This is not ideal and should be 'sem_wait' instead.
+    // However, sem_wait does not seem to work when the semaphore is
+    // inside of a shared memory, used by two QEMU VMs.
+    while (sem_trywait(&comm.requestQueue->active_reqs) != 0) {
+        if (run == 0) {
+            return;
+        }
+    }
+
+    for (int i = 0; i < MAX_REQUESTS; i++) {
+        if (sem_trywait(&comm.requestQueue->requests[i].serverNotifier) == 0) {
+            ClientRequest request{&comm.requestQueue->requests[i],
+                                  std::string{comm.requestQueue->requests[i].text},
+                                  comm.requestQueue->requests[i].n_chars_to_gen,
+                                  comm.requestQueue->requests[i].id,
+                                  comm.requestQueue->requests[i].prio,
+                                  comm.requestQueue->requests[i].lora};
+            open_reqs_mutex[request.prio].lock();
+            open_reqs[request.prio].emplace_back(request.id, request);
+
+            // lower numbers have higher priority
+            if (request.prio < highest_avail_prio.load())
+                highest_avail_prio.store(request.prio);
+
+            open_reqs_mutex[request.prio].unlock();
+
+            // Update the number of requests we received from this specific client
+            if (client_request_counter.count(request.id)) {
+                client_request_counter[request.id]++;
+            } else {
+                client_request_counter[request.id] = 1;
+            }
+        }
+    }
+}
+
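+// Free the llama.cpp resources of finished requests, notify the waiting client
+// through its clientNotifier semaphore, and keep the round-robin index
+// consistent with the shrunken list.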
+void remove_completed_reqs(void) {
+    for (int i = 0; i < NB_PRIORITIES; i++) {
+        int j = 0;
+        open_reqs_mutex[i].lock();
+        for (auto req_iterator = open_reqs[i].begin(); req_iterator != open_reqs[i].end();) {
+            if (req_iterator->request.done) {
+                auto request = req_iterator->request;
+                // free the resources
+                llama_synchronize(request.ctx);
+                llama_kv_cache_clear(request.ctx);
+                llama_kv_cache_view_update(request.ctx, &request.kv_view);
+                llama_batch_free(request.batch);
+                llama_sampler_free(request.smpl);
+                llama_free(request.ctx);
+                if (request.state_save) {
+                    free(request.state_save);
+                    request.state_save = NULL;
+                }
+                sem_post(&request.get_raw_request()->clientNotifier);
+
+                // remove the element and keep round robin index consistent
+                if (j < round_robin_next[i])
+                    round_robin_next[i]--;
+                req_iterator = open_reqs[i].erase(req_iterator);
+            } else {
+                ++req_iterator;
+            }
+        }
+        open_reqs_mutex[i].unlock();
+    }
+}
+
+// Assumes there are requests in the queue
+ClientRequest &schedule_next(void) {
+    // assert(reqs_available() == true);
+    size_t i = 0;
+    for (; i < open_reqs.size(); i++) {
+        open_reqs_mutex[i].lock();
+        if (!open_reqs[i].empty()) {
+            open_reqs_mutex[i].unlock();
+            break;
+        }
+        open_reqs_mutex[i].unlock();
+    }
+
+    if (i >= open_reqs.size()) {
+        throw std::out_of_range("No element could be found");
+    }
+
+    open_reqs_mutex[i].lock();
+
+    auto req_it = open_reqs[i].begin();
+    std::advance(req_it, round_robin_next[i]);
+    ClientRequest &req = req_it->request;
+
+    if (last_processed_request_id != -1 && last_processed_request_id != req.id) {
+        req.require_context_switch = true;
+        // find the last request
+        auto it =
+            std::find_if(open_reqs[last_processed_request_prio].begin(),
+                         open_reqs[last_processed_request_prio].end(),
+                         [](const request_entry &r) { return r.id == last_processed_request_id; });
+        // if found, suspend this request. It will get cleaned by the scanner thread.
+        if (it != open_reqs[last_processed_request_prio].end()) {
+            std::cout << "Freeing context to serve a different request..." << std::endl;
+            auto t1 = std::chrono::high_resolution_clock::now();
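+            // Preserve the preempted request's state according to its policy:
+            // save_load_full serializes the whole llama state, save_load_kv only
+            // the KV cache of sequence 0, and recompute saves nothing here (the
+            // context is rebuilt from the stored tokens when the request resumes).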
+            switch (it->request.policy) {
+                case Policy::save_load_full: {
+                    // Save the state
+                    std::cout << "\tSaving context before free..." << std::endl;
+
+                    const size_t ctx_size = llama_get_state_size(it->request.ctx);
+                    it->request.state_save = (uint8_t *)malloc(ctx_size);
+                    llama_state_get_data(it->request.ctx, it->request.state_save, ctx_size);
+                    break;
+                }
+                case Policy::save_load_kv: {
+                    // Save the kv_cache
+                    std::cout << "\tSaving kv cache before free..." << std::endl;
+
+                    it->request.kv_cache_size = llama_state_seq_get_size(it->request.ctx, 0);
+                    it->request.kv_cache_backup = (uint8_t *)malloc(it->request.kv_cache_size);
+                    llama_state_seq_get_data(it->request.ctx, it->request.kv_cache_backup,
+                                             it->request.kv_cache_size, 0);
+                    break;
+                }
+                default:
+                    break;
+            };
+
+            llama_kv_cache_clear(it->request.ctx);
+            llama_kv_cache_view_update(it->request.ctx, &it->request.kv_view);
+            // save load state does not cover the batch
+            llama_batch_free(it->request.batch);
+            llama_sampler_free(it->request.smpl);
+            llama_free(it->request.ctx);
+
+            auto t2 = std::chrono::high_resolution_clock::now();
+
+            auto duration = duration_cast<std::chrono::milliseconds>(t2 - t1);
+            LOG_MSG(stdout, "%s: Context save in %ld ms\n", __func__, duration.count());
+        }
+    }
+
+    last_processed_request_id = req.id;
+    last_processed_request_prio = req.prio;
+
+    round_robin_next[i]++;
+
+    req_it = open_reqs[i].begin();
+    std::advance(req_it, round_robin_next[i]);
+    if (req_it == open_reqs[i].end()) {
+        round_robin_next[i] = 0;
+    }
+
+    curr_priority = req.prio;
+    // TODO: consider keeping the lock held here to avoid overwriting a higher
+    // priority value that arrived in the meantime
+    highest_avail_prio.store(req.prio);
+
+    open_reqs_mutex[i].unlock();
+
+    if (!req.done)
+        return req;
+    else
+        throw std::out_of_range("No element could be found");
+}
+
+bool reqs_available(void) {
+    for (int i = 0; i < NB_PRIORITIES; i++) {
+        open_reqs_mutex[i].lock();
+        if (!open_reqs[i].empty()) {
+            LOG_MSG(stdout, "[TEO_DEBUG] There is a new request!\n");
+            open_reqs_mutex[i].unlock();
+            return true;
+        }
+        open_reqs_mutex[i].unlock();
+    }
+    return false;
+}
+
+void update_req_list() {
+    while (run) {
+        scan_requests();
+    }
+}
+
+int main(int argc, char **argv) {
+    // init_shm(true, comm);
+
+    Arguments arguments;
+    arguments.state_save = false;
+    arguments.throughput_limit = 1000;
+    arguments.tokens_to_gen = 10;
+
+    // Parse arguments
+    argp_parse(&argp, argc, argv, 0, 0, &arguments);
+
+    auto_policy = arguments.auto_policy;
+    n_threads = arguments.n_threads;
+
+    if (!arguments.sig_file.empty()) {
+        std::ofstream signal_file(arguments.sig_file.c_str());
+        signal_file << "signal" << std::endl;
+        signal_file.close();
+    }
+
+    // Parse llama-bench style arguments if exists
+    if (arguments.llama_bench_args) {
+        cmd_params params = cmd_params_defaults;
+        std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
+        std::cout << "Using llama-bench arguments for model and context creation...\n" << std::endl;
+        use_llama_bench_args = true;
+        llama_bench_args = params_instances[0];
+    }
+
+    // Set the default policy for all requests.
+    if (arguments.state_save) {
+        general_policy = Policy::save_load_full;
+    } else if (arguments.kv_save) {
+        general_policy = Policy::save_load_kv;
+    } else {
+        general_policy = Policy::recompute;
+    }
+
+    init_shm(true, comm, arguments.shm_location);
+    ggml_time_init();
+
+    struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_default(n_threads);
+    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
+
+    set_process_priority(GGML_SCHED_PRIO_NORMAL);
+
+    threadpool_batch = NULL;
+    if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
+        threadpool_batch = ggml_threadpool_new(&tpp_batch);
+        if (!threadpool_batch) {
+            return 1;
+        }
+
+        // Start the non-batch threadpool in the paused state
+        tpp.paused = true;
+    }
+
+    threadpool = ggml_threadpool_new(&tpp);
+    if (!threadpool) {
+        return 1;
+    }
+
+    prepare_llama_cpp(arguments);
+    // OPTIMIZE: No need for 2 passes to check if empty and to schedule next
+
+    std::thread req_list_updater{update_req_list};
+    std::thread control_scanner{scan_control};
+
+    printf("Server accepting requests now!\n");
+    const auto t_main_start = ggml_time_us();
+    while (run) {
+        if (arguments.token_limit < server_total_nb_tokens_generated) {
+            break;
+        }
+        remove_completed_reqs();
+        if (!reqs_available()) {
+            continue;
+        }
+        LOG_MSG(stdout, "[TEO_DEBUG]: Get next request to be processed\n");
+        ClientRequest &request = schedule_next();
+        LOG_MSG(stdout, "[TEO_DEBUG]: Selected request %d that wants to generate %d characters\n",
+               request.id, request.n_chars_to_gen);
+        handle_request(request, arguments.tokens_to_gen, arguments.throughput_limit);
+    }
+    const auto t_main_end = ggml_time_us();
+    std::cout << "Shutting down the server..." << std::endl;
+
+    if (!arguments.performance_metrics_file.empty()) {
+        float throughput =
+            server_total_nb_tokens_generated / ((t_main_end - t_main_start) / 1000000.0f);
+
+        std::chrono::milliseconds avg_ctx_creation_duration{0};
+        if (nb_context_creations != 0) {
+            avg_ctx_creation_duration = total_context_creation_duration / nb_context_creations;
+        }
+
+        std::chrono::milliseconds avg_ctx_switch_duration{0};
+        if (nb_context_switches != 0) {
+            avg_ctx_switch_duration = total_context_switch_duration / nb_context_switches;
+        }
+
+        std::ofstream metric_file(arguments.performance_metrics_file.c_str());
+        metric_file << "Tokens generated: " << server_total_nb_tokens_generated << "\n"
+                    << "Time spent: " << (t_main_end - t_main_start) / 1000000.0f << "\n"
+                    << "Throughput (t/s): " << throughput << "\n"
+                    << "Time spent on context creation: " << total_context_creation_duration << "\n"
+                    << "Number of context creations: " << nb_context_creations << "\n"
+                    << "Average context creation time: " << avg_ctx_creation_duration << "\n"
+                    << "Time spent on context switches: " << total_context_switch_duration << "\n"
+                    << "Number of context switches: " << nb_context_switches << "\n"
+                    << "Average context switch time: " << avg_ctx_switch_duration << std::endl;
+        metric_file.close();
+    }
+
+    if (!arguments.sig_file.empty()) {
+        std::remove(arguments.sig_file.c_str());
+    }
+
+    req_list_updater.join();
+    control_scanner.join();
+
+    ggml_threadpool_free(threadpool);
+    ggml_threadpool_free(threadpool_batch);
+
+    clean_llama_cpp();
+    // free(result);
+    return 0;
+}
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/shm.cpp b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/shm.cpp
new file mode 100644
index 000000000..780a3861a
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/intervm_comm/src/shm.cpp
@@ -0,0 +1,81 @@
+#include "shm.h"
+
+#include <assert.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "misc.h"
+
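+// Map the shared-memory region at SHM_LOCATION (or at the optional override
+// path) and, when create == true, zero it and (re)initialize all process-shared
+// semaphores and mutexes; with create == false we only attach to an already
+// initialized region. A minimal client-side sketch (assuming the guest's
+// ivshmem BAR is used as the override path):
+//
+//     Communication comm;
+//     init_shm(false, comm, "/sys/bus/pci/devices/0000:00:12.0/resource2");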
+int init_shm(bool create, Communication &comm, std::string optional_shm_location) {
+    int fd;
+    if (!optional_shm_location.empty()) {
+        fd = open(optional_shm_location.c_str(), O_RDWR);
+        assert(fd != -1);
+    } else {
+        fd = open(SHM_LOCATION, O_RDWR);
+        assert(fd != -1);
+    }
+
+    long pagesize = sysconf(_SC_PAGESIZE);
+    long shm_size = round_up(sizeof(struct SharedMemory), pagesize);
+    if (ftruncate(fd, shm_size) == -1) {
+        perror("ftruncate");
+        return -1;
+    }
+
+    comm.requestQueue =
+        (SharedMemory *)mmap(0, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    assert(comm.requestQueue != NULL);
+    assert(comm.requestQueue != MAP_FAILED);
+
+    close(fd);
+
+    if (!create) {
+        // only attach to the shared memory, do not set up semaphores and mutexes
+        return 0;
+    }
+
+    memset(comm.requestQueue, 0, shm_size);
+
+    // Init mutexes
+    pthread_mutexattr_t attr;
+    pthread_mutexattr_init(&attr);
+    pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
+
+    sem_destroy(&comm.requestQueue->active_reqs);
+    if (sem_init(&comm.requestQueue->active_reqs, 1, 0) != 0) {
+        perror("sem_init failed");
+        exit(EXIT_FAILURE);
+    }
+
+    sem_destroy(&comm.requestQueue->run);
+    if (sem_init(&comm.requestQueue->run, 1, 1) != 0) {  // init to 1 since server will start
+        perror("sem_init failed");
+        exit(EXIT_FAILURE);
+    }
+
+    for (int i = 0; i < MAX_REQUESTS; i++) {
+        pthread_mutex_init(&comm.requestQueue->requests[i].mutex, &attr);
+        sem_destroy(&comm.requestQueue->requests[i].clientNotifier);
+        if (sem_init(&comm.requestQueue->requests[i].clientNotifier, 1, 0) != 0) {
+            perror("sem_init failed");
+            exit(EXIT_FAILURE);
+        }
+        sem_destroy(&comm.requestQueue->requests[i].serverNotifier);
+        if (sem_init(&comm.requestQueue->requests[i].serverNotifier, 1, 0) != 0) {
+            perror("sem_init failed");
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    return 0;
+}
+
+int clean_shm(struct SharedMemory *shm) {
+    munmap(shm, SHM_SIZE);
+    return 0;
+}
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/nvidia.nix b/archive/2025/summer/msc_berkay_eren_ueruen/nvidia.nix
new file mode 100644
index 000000000..98f3cca0d
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/nvidia.nix
@@ -0,0 +1,67 @@
+{ pkgs, config, ... }:
+{
+  nixpkgs.config.allowUnfree = true;
+  nixpkgs.config.nvidia.acceptLicense = true;
+  # enable the nvidia driver
+  services.xserver.videoDrivers = [ "nvidia" ];
+  hardware.opengl.enable = true;
+  hardware.nvidia.datacenter.enable = true;
+  hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.dc_535;
+
+  hardware.nvidia.open = true;
+
+  virtualisation.docker.enable = true;
+  hardware.nvidia-container-toolkit.enable = true;
+  hardware.opengl.driSupport32Bit = true;
+
+  virtualisation.memorySize = 64 * 1024;
+  virtualisation.diskSize = 128 * 1024;
+  # Nix Store must be persistent across all QEMU
+  # executions
+  virtualisation.writableStoreUseTmpfs = false;
+
+  virtualisation.qemu.options = [
+    #"-name NIXVM,debug-threads=on"
+    "-enable-kvm"
+    "-cpu host"
+    "-device vfio-pci,host=ca:00.0"
+    # addr must match the address in llama_server.cpp and write.c:
+    # int fd = open("/sys/bus/pci/devices/0000:00:12.0/resource2", O_RDWR);
+    # or if the addr were 0x11 then:
+    # int fd = open("/sys/bus/pci/devices/0000:00:11.0/resource2", O_RDWR);
+    # It is worth mentioning that for a two-VM test setup addr must be the same for both VMs.
+    # At the same time, for any other VMs outside of the test setup, addr must
+    # be different. So I generally just change the addr locally before launching any VMs.
+    "-device ivshmem-plain,memdev=shm1,bus=pci.0,addr=0x12,master=on"
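+    # The 32M backing size below matches the SHM_SIZE constant (32 MiB) used by
+    # the inter-VM communication code.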
+    "-object memory-backend-file,size=32M,share=on,mem-path=/dev/shm/shm1,id=shm1"
+    "-smp 32,sockets=1,cores=32,threads=1,maxcpus=32"
+    "-m 64G"
+  ];
+
+  nixos-shell.mounts.extraMounts = {
+    # override options for each mount
+    "/llm-os" = {
+      target = ../.;
+      cache = "none";
+    };
+
+  };
+  nixos-shell.mounts = {
+    mountHome = false;
+    mountNixProfile = false;
+    cache = "none"; # default is "loose"
+  };
+
+
+  environment.systemPackages = [
+  	config.boot.kernelPackages.nvidiaPackages.dc_535
+#	config.boot.kernelPackages.nvidia_x11
+	pkgs.git
+	pkgs.neovim
+	pkgs.cudaPackages.cuda_cudart.stubs
+	pkgs.cudatoolkit
+	pkgs.vim
+	];
+}
+
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/16-clients-long.py b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/16-clients-long.py
new file mode 100644
index 000000000..d72e12421
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/16-clients-long.py
@@ -0,0 +1,27 @@
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy.interpolate import make_interp_spline
+
+# total number of generated tokens
+x_axis = np.array([641, 960, 1211, 1391, 1591])
+
+# throughput (tokens per second)
+vm_cpu = np.array([1.02924, 0.79405, 0.663664, 0.573927, 0.525171])
+
+# smooth out curves
+#x_axis_new = np.linspace(x_axis.min(), x_axis.max(), 500)
+
+plt.title("Average Throughput by the Total Number of Generated\nTokens with 16 Clients (Same Priority Requests)")
+
+#plt.title("LLM-OS overhead")
+plt.xlabel("Total Number of Tokens Generated for 16 Clients")
+plt.ylabel("Throughput (tokens per second)")
+a = plt.plot(x_axis, vm_cpu, label="CPU Inference", color=('#b9a050'))
+
+plt.ylim([0,2])
+plt.legend()
+
+plt.savefig('16-clients-long.png', format='png', dpi=1200)
+#plt.show()
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/context_switch.py b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/context_switch.py
new file mode 100644
index 000000000..72d4fed26
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/context_switch.py
@@ -0,0 +1,32 @@
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy.interpolate import make_interp_spline
+
+# number of tokens in each request
+x_axis = np.array([100, 300, 500, 1000, 1500, 2048, 4000])
+
+# context switch duration (ms)
+#kv_reset = np.array([18.6,22.6,26.6,41,55, 73, 1460])
+kv_reset = np.array([51.6,97.6,140.6,278,428.3, 599.5, 1304])
+state_restore = np.array([252+195.3, 249.3+187.3, 280+207.3, 305+209.3, 341+228, 382+221, 521+260]) # avg save time + avg load time
+
+# smooth out curves
+#x_axis_new = np.linspace(x_axis.min(), x_axis.max(), 500)
+
+
+
+#plt.title("LLM-OS overhead")
+plt.title("Context switch duration by number of tokens\nLower is better ↓", fontsize=9, color="navy", weight="bold")
+plt.xlabel("Number of tokens in each request")
+plt.ylabel("Context switch duration (ms)")
+plt.plot(x_axis, kv_reset, label = 'KV Recompute')
+plt.plot(x_axis, state_restore, label = 'KV Save/Load (KV Cache)')
+plt.legend()
+plt.grid()
+x_ticks = np.linspace(0, 4000, 5, dtype=int)
+plt.xticks(x_ticks, (str(i) for i in x_ticks))
+
+plt.savefig('context_switch_ctx_4096.png', format='png', dpi=1200)
+#plt.show()
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/context_switch_combi.py b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/context_switch_combi.py
new file mode 100644
index 000000000..c6e891c39
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/context_switch_combi.py
@@ -0,0 +1,37 @@
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy.interpolate import make_interp_spline
+
+# KV cache capacity (tokens)
+x_axis = np.array([128, 512, 1024, 2048, 3000, 3500, 4096])
+
+# context switch duration (ms)
+#kv_reset = np.array([6,19.3,34.6,51.6,97,1170, 1460])
+kv_reset = np.array([33,127,268,569,880,1062, 1460])
+state_restore = np.array([228+161.6, 261+180.6,300+199.3, 600, 675, 474+250, 521+260]) # avg save time + avg load time
+
+kv_reset_4096 = np.array([51.6,97.6,140.6,278,428.3, 599.5, 1304])
+state_restore_4096 = np.array([252+195.3, 249.3+187.3, 280+207.3, 305+209.3, 341+228, 400+297, 521+260]) # avg save time + avg load time
+
+# smooth out curves
+#x_axis_new = np.linspace(x_axis.min(), x_axis.max(), 500)
+
+
+
+#plt.title("LLM-OS overhead")
+plt.title("Context switch duration with fully utilized KV cache\nLower is better ↓", fontsize=9, color="navy", weight="bold")
+plt.xlabel("KV cache capacity")
+plt.ylabel("Context switch duration (ms)")
+plt.plot(x_axis, kv_reset, label = 'KV recompute')
+plt.plot(x_axis, state_restore, label = 'State save/load')
+plt.plot(x_axis, kv_reset_4096, label = 'KV recompute 4096')
+plt.plot(x_axis, state_restore_4096, label = 'State save/load 4096')
+plt.legend()
+plt.grid()
+x_ticks = np.linspace(0, 4000, 5, dtype=int)
+plt.xticks(x_ticks, (str(i) for i in x_ticks))
+
+plt.savefig('context_switch_combi.png', format='png', dpi=1200)
+#plt.show()
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/context_switch_ctx.py b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/context_switch_ctx.py
new file mode 100644
index 000000000..94f1f2083
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/context_switch_ctx.py
@@ -0,0 +1,32 @@
+
+import numpy as np
+from scipy.interpolate import make_interp_spline
+import matplotlib.pyplot as plt
+
+# KV cache capacity (tokens)
+x_axis = np.array([128, 512, 1024, 2048, 3000, 3500, 4096])
+
+# context switch duration (ms)
+#kv_reset = np.array([6,19.3,34.6,51.6,97,1170, 1460])
+kv_reset = np.array([33,127,268,569,880,1062, 1460])
+state_restore = np.array([228+161.6, 261+180.6,300+199.3, 600, 675, 474+250, 521+260]) # avg save time + avg load time
+
+# smooth out curves
+#x_axis_new = np.linspace(x_axis.min(), x_axis.max(), 500)
+
+
+
+#plt.title("LLM-OS overhead")
+plt.title("Context switch duration with fully utilized KV cache\nLower is better ↓")
+plt.xlabel("KV cache capacity (tokens)")
+plt.ylabel("Context switch duration (ms)")
+plt.plot(x_axis, kv_reset, label = 'KV recompute', color=('#57978e'), linewidth=2)
+plt.plot(x_axis, state_restore, label = 'State save/load', color=('#b9a050'), linewidth=2)
+plt.legend()
+plt.grid()
+x_ticks = np.linspace(0, 4000, 5, dtype=int)
+plt.xticks(x_ticks, (str(i) for i in x_ticks))
+
+plt.savefig('context_switch_by_ctx.png', format='png', dpi=1200)
+#plt.show()
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/host_effect.py b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/host_effect.py
new file mode 100644
index 000000000..d0006adec
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/host_effect.py
@@ -0,0 +1,71 @@
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy.interpolate import make_interp_spline
+
+# throughput limits (tokens per second)
+x_axis_gpu = np.array(["No LLM-OS\n Service", 1, 10, 20, 30, 40, "No Limit"])
+#x_axis_cpu = np.array([4, 3, 2, 1])
+x_axis_cpu = np.array(["No LLM-OS\n Service", 0.5, 1, 1.5, 2, "No Limit"])
+
+# OpenSSL SHA256 throughput in bytes per second (converted to MB/s below)
+#kv_reset = np.array([18.6,22.6,26.6,41,55, 73, 1460])
+#data_gpu = np.array([10814699040 , 10251175665 , 10415925995 , 10437800165])
+#data_gpu = np.array([9433416625 , 10475651855 , 10809967645 , 9971406935])
+data_gpu = np.array([0, 10813820552, 10831253104, 10813706132, 10810291586, 10831513788, 0])
+#data_cpu = np.array([8607574140 , 10037069885, 10246498645 , 9965576165])
+#data_cpu = np.array([10179859600 , 10265992510, 9909305865 , 9729481400])
+#data_cpu = np.array([10388914660, 10481455910,10455820540, 10708577720])
+data_cpu = np.array([0,10660883102,10550895312,10418828036, 10321112206, 0])
+
+#base = np.full((4,1), 11588837277 / 1024 / 1024)
+#base = np.full((4,1), 10860600075 / 1024 / 1024)
+base_gpu = [11553883234 / 1024 / 1024, 0, 0, 0, 0, 0, 0]
+base_cpu = [11553883234 / 1024 / 1024, 0, 0, 0, 0, 0]
+#no_limit_gpu = np.full((4,1), 9377212543 / 1024 / 1024)
+#no_limit_gpu = np.full((4,1), 8968689250 / 1024 / 1024)
+no_limit_gpu = np.array([0,0,0,0,0,0,10805066652 / 1024 / 1024])
+#no_limit_cpu = np.full((4,1), 9108544705 / 1024 / 1024)
+#no_limit_cpu = np.full((4,1), 10084521855 / 1024 / 1024)
+no_limit_cpu = np.array([0, 0, 0, 0, 0, 9941705452 / 1024 / 1024])
+
+data_gpu = data_gpu / 1024 /1024
+data_cpu = data_cpu / 1024 /1024
+
+fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(12, 6))
+fig.suptitle('Client side OpenSSL SHA256 performance by inference server throughput limit')
+
+# smooth out curves
+#x_axis_new = np.linspace(x_axis.min(), x_axis.max(), 500)
+
+
+#plt.title("LLM-OS overhead")
+fig.supxlabel("Throughput Limit (tokens per second)")
+fig.supylabel("OpenSSL Performance (megabytes per second)")
+
+ax1.set_ylim([7000,12000])
+ax2.set_ylim([7000,12000])
+
+a = ax1.bar(x_axis_gpu, data_gpu, label = 'Throughput Limit', color=('#A0CA9C'))
+ax1.bar_label(a, label_type='edge', fmt=int)
+a = ax1.bar(x_axis_gpu, base_gpu, label = 'No LLM-OS Service', linestyle='dashed', color=('#A897F5'))
+ax1.bar_label(a, label_type='edge', fmt=int)
+a = ax1.bar(x_axis_gpu, no_limit_gpu, label = 'No Throughput Limit', linestyle='dashed', color=('#747474'))
+ax1.bar_label(a, label_type='edge', fmt=int)
+ax1.set_title("GPU")
+
+
+a = ax2.bar(x_axis_cpu, data_cpu, label = 'Throughput Limit', color=('#F3D683'))
+ax2.bar_label(a, label_type='edge', fmt=int)
+a = ax2.bar(x_axis_cpu, base_cpu, label = 'No LLM-OS Service', linestyle='dashed', color=('#A897F5'))
+ax2.bar_label(a, label_type='edge', fmt=int)
+a = ax2.bar(x_axis_cpu, no_limit_cpu, label = 'No Throughput Limit', linestyle='dashed', color=('#747474'))
+ax2.bar_label(a, label_type='edge', fmt=int)
+ax2.set_title("CPU")
+
+ax1.legend(loc='lower right')
+ax2.legend(loc='lower right')
+
+plt.savefig('host-effect.png', format='png', dpi=1200)
+#plt.show()
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/host_effect_seperat.py b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/host_effect_seperat.py
new file mode 100644
index 000000000..86864e429
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/host_effect_seperat.py
@@ -0,0 +1,94 @@
+import matplotlib.pyplot as plt
+import matplotlib
+import numpy as np
+from scipy.interpolate import make_interp_spline
+
+# throughput limits (tokens per second)
+x_axis_gpu = np.array(["No LLM-OS\n Service", 1, 10, 20, 30, 40, "No Limit"])
+x_axis_cpu = np.array(["No LLM-OS\n Service", 0.5, 1, 1.5, 2, "No Limit"])
+
+# Data for GPU plot
+data_gpu = np.array([0, 10813820552, 10831253104, 10813706132, 10810291586, 10831513788, 0])
+base_gpu = [11553883234 / 1024 / 1024, 0, 0, 0, 0, 0, 0]
+no_limit_gpu = np.array([0, 0, 0, 0, 0, 0, 10805066652 / 1024 / 1024])
+
+# Data for CPU plot
+data_cpu = np.array([0, 10660883102, 10550895312, 10418828036, 10321112206, 0])
+base_cpu = [11553883234 / 1024 / 1024, 0, 0, 0, 0, 0]
+no_limit_cpu = np.array([0, 0, 0, 0, 0, 9941705452 / 1024 / 1024])
+
+# Convert to MB/s
+data_gpu = data_gpu / 1024 / 1024
+data_cpu = data_cpu / 1024 / 1024
+
+# Create and save GPU plot
+fig_gpu = plt.figure(figsize=(7, 6))
+ax_gpu = fig_gpu.add_subplot(111)
+ax_gpu.set_title("GPU")
+ax_gpu.set_xlabel("Throughput Limit (tokens per second)")
+ax_gpu.set_ylabel("OpenSSL Performance (megabytes per second)")
+ax_gpu.set_ylim([7000, 12000])
+
+a = ax_gpu.bar(x_axis_gpu, data_gpu, label='Throughput Limit', color='#A0CA9C')
+ax_gpu.bar_label(a, label_type='edge', fmt=int)
+a = ax_gpu.bar(x_axis_gpu, base_gpu, label='No LLM-OS Service', linestyle='dashed', color='#A897F5')
+ax_gpu.bar_label(a, label_type='edge', fmt=int)
+a = ax_gpu.bar(x_axis_gpu, no_limit_gpu, label='No Throughput Limit', linestyle='dashed', color='#747474')
+ax_gpu.bar_label(a, label_type='edge', fmt=int)
+ax_gpu.legend(loc='lower right')
+plt.tight_layout()
+plt.savefig('host-effect-gpu.png', format='png', dpi=1200)
+plt.close(fig_gpu)
+
+# Create and save CPU plot
+fig_cpu = plt.figure(figsize=(7, 6))
+ax_cpu = fig_cpu.add_subplot(111)
+ax_cpu.set_title("CPU")
+ax_cpu.set_xlabel("Throughput Limit (tokens per second)")
+ax_cpu.set_ylabel("OpenSSL Performance (megabytes per second)")
+ax_cpu.set_ylim([7000, 12000])
+
+a = ax_cpu.bar(x_axis_cpu, data_cpu, label='Throughput Limit', color='#F3D683')
+ax_cpu.bar_label(a, label_type='edge', fmt=int)
+a = ax_cpu.bar(x_axis_cpu, base_cpu, label='No LLM-OS Service', linestyle='dashed', color='#A897F5')
+ax_cpu.bar_label(a, label_type='edge', fmt=int)
+a = ax_cpu.bar(x_axis_cpu, no_limit_cpu, label='No Throughput Limit', linestyle='dashed', color='#747474')
+ax_cpu.bar_label(a, label_type='edge', fmt=int)
+ax_cpu.legend(loc='lower right')
+plt.tight_layout()
+plt.savefig('host-effect-cpu.png', format='png', dpi=1200)
+plt.close(fig_cpu)
+
+# Combined two-panel figure (GPU and CPU side by side)
+fig_combined, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
+fig_combined.suptitle('Client-side OpenSSL SHA256 performance by inference server throughput limit')
+fig_combined.supxlabel("Throughput Limit (tokens per second)")
+fig_combined.supylabel("OpenSSL Performance (megabytes per second)")
+
+ax1.set_ylim([7000, 12000])
+ax2.set_ylim([7000, 12000])
+
+# GPU plot (left)
+a = ax1.bar(x_axis_gpu, data_gpu, label='Throughput Limit', color='#A0CA9C')
+ax1.bar_label(a, label_type='edge', fmt=int)
+a = ax1.bar(x_axis_gpu, base_gpu, label='No LLM-OS Service', linestyle='dashed', color='#A897F5')
+ax1.bar_label(a, label_type='edge', fmt=int)
+a = ax1.bar(x_axis_gpu, no_limit_gpu, label='No Throughput Limit', linestyle='dashed', color='#747474')
+ax1.bar_label(a, label_type='edge', fmt=int)
+ax1.set_title("GPU")
+
+# CPU plot (right)
+a = ax2.bar(x_axis_cpu, data_cpu, label='Throughput Limit', color='#F3D683')
+ax2.bar_label(a, label_type='edge', fmt=int)
+a = ax2.bar(x_axis_cpu, base_cpu, label='No LLM-OS Service', linestyle='dashed', color='#A897F5')
+ax2.bar_label(a, label_type='edge', fmt=int)
+a = ax2.bar(x_axis_cpu, no_limit_cpu, label='No Throughput Limit', linestyle='dashed', color='#747474')
+ax2.bar_label(a, label_type='edge', fmt=int)
+ax2.set_title("CPU")
+
+ax1.legend(loc='lower right')
+ax2.legend(loc='lower right')
+
+plt.tight_layout()
+plt.savefig('host-effect-combined.png', format='png', dpi=1200)
+plt.close(fig_combined)
\ No newline at end of file
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/implementation_overhead.py b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/implementation_overhead.py
new file mode 100644
index 000000000..eecd6edaf
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/implementation_overhead.py
@@ -0,0 +1,41 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Execution environments compared on the x axis
+x_axis = np.array(['llama-bench\n(Native)', 'llama-bench\n(VM)', 'LLM-OS Kernel'])
+
+# Throughput (tokens per second)
+#kv_reset = np.array([18.6,22.6,26.6,41,55, 73, 1460])
+data_gpu = np.array([93.985399, 92.628699, 83.4922])
+data_cpu = np.array([3.180029, 3.118043, 3.07812])
+
+# smooth out curves
+#x_axis_new = np.linspace(x_axis.min(), x_axis.max(), 500)
+
+# Relative throughput of native llama-bench vs. the LLM-OS kernel, in percent
+print(data_gpu[0] / data_gpu[2] * 100)
+print(data_cpu[0] / data_cpu[2] * 100)
+
+fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(10, 5))
+fig.suptitle('Inference overhead of our implementation with a single client generating 512 tokens')
+
+#plt.title("LLM-OS overhead")
+fig.supylabel("Throughput (tokens per second)")
+a = ax1.bar(x_axis, data_gpu, label = 'GPU Inference', color=('#A0CA9C'))
+a[0].set_color('#A0CA9C')
+a[1].set_color('#77b196')
+a[2].set_color('#57978e')
+ax1.bar_label(a, label_type='edge', fmt="%.2f")
+ax1.set_title("GPU")
+ax1.set_ylim([0,100])
+a = ax2.bar(x_axis, data_cpu, label = 'CPU Inference', color=('#F3D683'))
+a[0].set_color('#F3D683')
+a[1].set_color('#ffd088')
+a[2].set_color('#ffc996')
+ax2.bar_label(a, label_type='edge', fmt="%.2f")
+ax2.set_title("CPU")
+ax2.set_ylim([0,5])
+plt.savefig('implementation-overhead.png', format='png', dpi=1200)
+#plt.show()
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/llama_bench_overhead.py b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/llama_bench_overhead.py
new file mode 100644
index 000000000..6efc4b20c
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/llama_bench_overhead.py
@@ -0,0 +1,47 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Number of tokens generated per run
+x_axis = np.array([16, 32, 64, 128, 256, 512, 1024])
+
+# Throughput (tokens per second)
+#kv_reset = np.array([18.6,22.6,26.6,41,55, 73, 1460])
+native_gpu = np.array([97.126297, 97.044359, 95.145198,95.544189,94.979616,93.985399, 92.903395])
+vm_gpu = np.array([92.169766, 94.367953, 94.813442, 94.287733, 93.690567, 92.628699, 91.229307])
+
+native_cpu = np.array([3.198267,3.228166, 3.229830, 3.217932, 3.209158, 3.180029, 3.093288])
+vm_cpu = np.array([3.144735, 3.183336,3.168134, 3.145999,3.157139,3.118043,3.069257])
+
+# smooth out curves
+#x_axis_new = np.linspace(x_axis.min(), x_axis.max(), 500)
+
+# Average native-to-VM throughput ratio, in percent
+print(np.average(native_gpu) / np.average(vm_gpu) * 100)
+print(np.average(native_cpu) / np.average(vm_cpu) * 100)
+
+fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(9, 4))
+fig.suptitle('Inference overhead of virtualization with llama-bench')
+
+#plt.title("LLM-OS overhead")
+fig.supxlabel("Number of tokens generated")
+fig.supylabel("Throughput (tokens per second)")
+a = ax1.bar(x_axis.astype('str'), native_gpu, label = 'Native', color=('#57978e'))
+ax1.bar_label(a, label_type='edge', fmt="%.1f")
+a = ax1.bar(x_axis.astype('str'), vm_gpu, label = 'VM', color=('#A0CA9C'))
+ax1.bar_label(a, label_type='edge', fmt="%.1f",padding=-12)
+ax1.set_title("GPU")
+
+ax1.set_ylim([60,100])
+
+a = ax2.bar(x_axis.astype('str'), native_cpu, label = 'Native', color=('#b9a050'))
+ax2.bar_label(a, label_type='edge', fmt="%.2f")
+a=ax2.bar(x_axis.astype('str'), vm_cpu, label = 'VM', color=('#F3D683'))
+ax2.bar_label(a, label_type='edge', fmt="%.2f", padding=-12)
+ax2.set_title("CPU")
+ax2.set_ylim([1,5])
+ax2.legend()
+
+plt.savefig('llama-bench-overhead.png', format='png', dpi=1200)
+#plt.show()
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/mock_app_scalability.py b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/mock_app_scalability.py
new file mode 100644
index 000000000..c68077873
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/mock_app_scalability.py
@@ -0,0 +1,33 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
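+# Annotate each bar with its value at the given x/y offsets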
+def add_labels(x,y, x_offset, y_offset):
+    for i in range(len(x)):
+        plt.text(i + x_offset, y[i] + y_offset, y[i], ha = 'center')
+
+x = ['256', '512', '1024', '2048']
+x_axis = np.arange(len(x))
+
+y_axis = [0, 5, 10, 15, 20]
+
+
+no_opt  = [16, 8, 4, 2]
+optim = [20, 12, 6, 3]
+
+plt.xticks(x_axis, x)
+plt.yticks(y_axis, (str(i) for i in y_axis))
+plt.title("Higher is better ↑", fontsize=9, color="navy", weight="bold")
+plt.xlabel("Context size (tokens)")
+plt.ylabel("# apps active")
+plt.bar(x_axis - 0.2, no_opt, 0.4, label = 'No mem. optimizations', edgecolor = "black")
+plt.bar(x_axis + 0.2, optim, 0.4, label = 'Mem. optimizations', edgecolor = "black")
+add_labels(x, no_opt, -0.2, 0.2)
+add_labels(x, optim, 0.2, 0.2)
+plt.legend()
+#plt.grid()
+
+plt.savefig('mock_app_scalability.png', format='png', dpi=1200)
+plt.show()
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/mock_impact.py b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/mock_impact.py
new file mode 100644
index 000000000..dfb41f338
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/mock_impact.py
@@ -0,0 +1,25 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Compilation tasks shown on the x axis
+x = ['Linux Kernel', 'Godot', 'gcc', 'gdb']
+x_axis = np.arange(len(x))
+
+
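+# Mock compile times in seconds: LLM-OS idle vs. LLM-OS actively running inference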
+idle  = [15 * 60, 10 * 60, 12 * 60, 11 * 60]
+infer = [17 * 60, 11 * 60, 14 * 60, 12 * 60]
+
+plt.xticks(x_axis, x)
+plt.title("Lower is better ↓", fontsize=9, color="navy", weight="bold")
+plt.xlabel("Compilation task")
+plt.ylabel("Time [s]")
+plt.bar(x_axis - 0.2, idle, 0.4, label = 'LLM-OS idle', edgecolor = "black")
+plt.bar(x_axis + 0.2, infer, 0.4, label = 'LLM-OS inference', edgecolor = "black")
+plt.legend()
+#plt.grid()
+
+plt.savefig('mock_impact.png', format='png', dpi=1200)
+plt.show()
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/mock_overhead.py b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/mock_overhead.py
new file mode 100644
index 000000000..1f7183faa
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/mock_overhead.py
@@ -0,0 +1,40 @@
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy.interpolate import make_interp_spline
+
+# context size
+x_axis = np.array([16, 32, 64, 128, 256, 512, 1024])
+
+# ms/token
+native = np.array([2, 3, 5, 7, 9, 12, 15])
+virt = np.array([4, 5, 7, 9, 11, 14, 17])
+trust = np.array([5, 6, 8, 10, 12, 15, 18])
+
+# smooth out curves
+x_axis_new = np.linspace(x_axis.min(), x_axis.max(), 500)
+
+native_spline = make_interp_spline(x_axis, native)
+native_smooth = native_spline(x_axis_new)
+
+virt_spline = make_interp_spline(x_axis, virt)
+virt_smooth = virt_spline(x_axis_new)
+
+trust_spline = make_interp_spline(x_axis, trust)
+trust_smooth = trust_spline(x_axis_new)
+
+#plt.title("LLM-OS overhead")
+plt.title("Lower is better ↓", fontsize=9, color="navy", weight="bold")
+plt.xlabel("Context size (tokens)")
+plt.ylabel("Inference latency (ms/token)")
+plt.plot(x_axis_new, native_smooth, label = 'Native')
+plt.plot(x_axis_new, virt_smooth, label = 'Virtualized')
+plt.plot(x_axis_new, trust_smooth, label = 'LLM-OS')
+plt.legend()
+plt.grid()
+x_ticks = np.linspace(0, 1024, 5, dtype=int)
+plt.xticks(x_ticks, (str(i) for i in x_ticks))
+
+plt.savefig('mock_overhead.png', format='png', dpi=1200)
+plt.show()
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/scaling.py b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/scaling.py
new file mode 100644
index 000000000..0172faa7c
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/scaling.py
@@ -0,0 +1,39 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Number of active clients
+x_axis = np.array([1, 2, 4, 8, 16])
+
+# Throughput (tokens per second)
+#kv_reset = np.array([18.6,22.6,26.6,41,55, 73, 1460])
+same_prio_gpu = np.array([81.8536, 53.9743, 52.2258, 52.411, 50.3522])
+diff_prio_gpu = np.array([81.8536, 80.4614, 72.5223, 72.9985, 71.122])
+
+same_prio_cpu = np.array([2.83991, 0.589248, 0.820965, 1.08444, 1.30965])
+diff_prio_cpu = np.array([2.83991, 3.00996, 3.01702, 3.01491, 2.45732])
+
+# smooth out curves
+#x_axis_new = np.linspace(x_axis.min(), x_axis.max(), 500)
+
+fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(10, 5))
+fig.suptitle('Throughput change with number of clients')
+
+#plt.title("LLM-OS overhead")
+#ax1.title("Througput change with number of same priority client")
+fig.supxlabel("Number of active clients")
+fig.supylabel("Throughput (tokens per second)")
+ax1.plot(x_axis, same_prio_gpu, label = 'Same priority requests')
+ax1.plot(x_axis, diff_prio_gpu, label = 'Different priority requests')
+ax1.set_title("GPU")
+
+ax2.plot(x_axis, same_prio_cpu, label = 'Same priority requests')
+ax2.plot(x_axis, diff_prio_cpu, label = 'Different priority requests')
+ax2.set_title("CPU")
+#plt.plot(x_axis, same_prio_gpu, label = 'Same priority requests on CPU')
+#plt.plot(x_axis, same_prio_gpu, label = 'Different priority requests on CPU')
+ax2.legend()
+plt.savefig('server-client-scaling.png', format='png', dpi=1200)
+#plt.show()
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/scaling_bar.py b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/scaling_bar.py
new file mode 100644
index 000000000..7f643eb60
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/scaling_bar.py
@@ -0,0 +1,48 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Number of active clients
+x_axis = np.array([2, 4, 8, 16])
+
+# Data
+same_prio_gpu = np.array([53.8424, 52.2921, 52.612, 52.4502])
+diff_prio_gpu = np.array([80.6864, 70.6828, 72.5182, 73.5048])
+
+same_prio_cpu = np.array([0.445242,0.611119, 0.8198, 1.02924])
+diff_prio_cpu = np.array([3.01894, 3.03376, 3.01022, 2.52995])
+
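+# Single-request baseline throughput (tokens per second)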
+base_gpu = np.full((4,1), 83.4922)
+base_cpu = np.full((4,1), 3.07812)
+
+# Bar width and x-axis offset
+bar_width = 0.35
+x_indices = np.arange(len(x_axis))
+
+# Create plots
+fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
+fig.suptitle('Throughput change with number of clients')
+fig.supxlabel("Number of active clients")
+fig.supylabel("Throughput (tokens per second)")
+
+# GPU bar chart
+ax1.bar(x_indices - bar_width/2, same_prio_gpu, width=bar_width, label='Same Priority Requests', color=('#57978e'))
+ax1.bar(x_indices + bar_width/2, diff_prio_gpu, width=bar_width, label='Different Priority Requests', color=('#A0CA9C'))
+ax1.hlines(y=83.4922, xmin=-0.5, xmax=3.5, colors='#747474', linestyles='--', label="Single Request")
+ax1.set_xticks(x_indices)
+ax1.set_xticklabels(x_axis)
+ax1.set_title("GPU")
+ax1.legend(loc='lower left')
+
+# CPU bar chart
+ax2.bar(x_indices - bar_width/2, same_prio_cpu, width=bar_width, label='Same Priority Requests', color=('#b9a050'))
+ax2.bar(x_indices + bar_width/2, diff_prio_cpu, width=bar_width, label='Different Priority Requests', color=('#F3D683'))
+ax2.hlines(y=3.01894, xmin=-0.5, xmax=3.5, colors='#747474', linestyles='--', label="Single Request")
+ax2.set_xticks(x_indices)
+ax2.set_xticklabels(x_axis)
+ax2.set_title("CPU")
+ax2.legend(loc='lower right')
+
+
+plt.tight_layout()
+plt.savefig('server-client-scaling.png', format='png', dpi=1200)
+#plt.show()
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/scheduler_n_tokens.py b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/scheduler_n_tokens.py
new file mode 100644
index 000000000..86cac6367
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/scheduler_n_tokens.py
@@ -0,0 +1,44 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Tokens generated between scheduler calls; "Single Client" is the uncontended baseline
+x_axis = np.array([10, 20, 30, 40, 50, 100, "Single\nClient"])
+
+# Throughput (tokens per second)
+vm_gpu = np.array([52.4502, 63.1353, 70.7027, 73.7928, 76.4382, 79.096, 0])
+vm_cpu = np.array([1.02924, 1.34828,1.64041, 1.8008,1.86078,2.48173, 0])
+
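+# Single-client baseline, plotted only in the "Single Client" column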
+baseline_gpu_1_client = np.array([0,0,0,0,0,0, 81.8536])
+baseline_cpu_1_client = np.array([0,0,0,0,0,0, 2.83991])
+
+# smooth out curves
+#x_axis_new = np.linspace(x_axis.min(), x_axis.max(), 500)
+
+fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(12, 5.5))
+fig.suptitle("Scheduling Granulaties' Effect on Performance of 16 Concurrent Clients")
+
+#plt.title("LLM-OS overhead")
+fig.supxlabel("Number of tokens to generate between scheduler calls")
+fig.supylabel("Throughput (tokens per second)")
+a = ax1.bar(x_axis.astype('str'), vm_gpu, label = 'GPU Throughput', color=('#A0CA9C'))
+ax1.bar_label(a, label_type='edge', fmt=lambda x: "{:.2f}".format(x) if x > 0 else '')
+a = ax1.bar(x_axis.astype('str'), baseline_gpu_1_client, label = 'Baseline', color=('#747474'))
+ax1.bar_label(a, label_type='edge', fmt=lambda x: "{:.2f}".format(x) if x > 0 else '')
+ax1.set_title("GPU")
+ax1.legend()
+
+ax1.set_ylim([0,100])
+
+a = ax2.bar(x_axis.astype('str'), vm_cpu, label = 'CPU Throughput', color=('#F3D683'))
+ax2.bar_label(a, label_type='edge', fmt=lambda x: "{:.2f}".format(x) if x > 0 else '')
+a = ax2.bar(x_axis.astype('str'), baseline_cpu_1_client, label = 'Baseline', color=('#747474'))
+ax2.bar_label(a, label_type='edge', fmt=lambda x: "{:.2f}".format(x) if x > 0 else '')
+ax2.set_title("CPU")
+ax2.set_ylim([0,5])
+ax2.legend()
+
+plt.savefig('scheduler.png', format='png', dpi=1200)
+#plt.show()
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/server_client_overhead.py b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/server_client_overhead.py
new file mode 100644
index 000000000..027d9371c
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/pyplots/server_client_overhead.py
@@ -0,0 +1,28 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Execution environments compared on the x axis
+x_axis = np.array(['Native', 'VM'])
+
+# Throughput (tokens per second)
+#kv_reset = np.array([18.6,22.6,26.6,41,55, 73, 1460])
+data_gpu = np.array([86.4729, 81.8536])
+data_cpu = np.array([1.61976, 2.83991])
+
+# smooth out curves
+#x_axis_new = np.linspace(x_axis.min(), x_axis.max(), 500)
+
+
+
+#plt.title("LLM-OS overhead")
+plt.title("Inference overhead of virtualization", fontsize=9, color="navy", weight="bold")
+plt.xlabel("Execution environment")
+plt.ylabel("Throughput (tokens per second)")
+plt.bar(x_axis, data_gpu, label = 'GPU Inference')
+plt.bar(x_axis, data_cpu, label = 'CPU Inference')
+plt.legend()
+plt.savefig('server-client-overhead.png', format='png', dpi=1200)
+#plt.show()
+
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/qemu_scripts/bind.sh b/archive/2025/summer/msc_berkay_eren_ueruen/qemu_scripts/bind.sh
new file mode 100755
index 000000000..173a90a7c
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/qemu_scripts/bind.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
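+# Drop the GPU's vendor:device ID from vfio-pci, remove the PCI device, and
+# rescan the bus so the default host driver can bind it again.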
+gpu="0000:ca:00.0"
+gpu_vd="$(cat /sys/bus/pci/devices/$gpu/vendor) $(cat /sys/bus/pci/devices/$gpu/device)"
+
+echo "$gpu_vd" > "/sys/bus/pci/drivers/vfio-pci/remove_id"
+echo 1 > "/sys/bus/pci/devices/$gpu/remove"
+echo 1 > "/sys/bus/pci/rescan"
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/qemu_scripts/start_qemu.sh b/archive/2025/summer/msc_berkay_eren_ueruen/qemu_scripts/start_qemu.sh
new file mode 100755
index 000000000..5f66505ab
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/qemu_scripts/start_qemu.sh
@@ -0,0 +1 @@
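+# Boot the guest with KVM: 64 GiB RAM, 32 vCPUs (1 socket x 16 cores x 2 threads), curses console, and the GPU at ca:00.0 passed through via vfio-pci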
+qemu-system-x86_64  -enable-kvm -hda deb.img -cdrom ubuntu-24.04.1-live-server-amd64.iso -m 65536 -display curses -nographic -device vfio-pci,host=ca:00.0 -smp 32,sockets=1,cores=16,threads=2,maxcpus=32
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/qemu_scripts/start_qemu_no_gpu.sh b/archive/2025/summer/msc_berkay_eren_ueruen/qemu_scripts/start_qemu_no_gpu.sh
new file mode 100755
index 000000000..d098fec1d
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/qemu_scripts/start_qemu_no_gpu.sh
@@ -0,0 +1 @@
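+# Same VM configuration as start_qemu.sh but without the vfio-pci GPU passthrough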
+qemu-system-x86_64  -enable-kvm -hda deb.img -cdrom ubuntu-24.04.1-live-server-amd64.iso -m 65536 -display curses -nographic -smp 32,sockets=1,cores=16,threads=2,maxcpus=32
diff --git a/archive/2025/summer/msc_berkay_eren_ueruen/qemu_scripts/unbind.sh b/archive/2025/summer/msc_berkay_eren_ueruen/qemu_scripts/unbind.sh
new file mode 100755
index 000000000..e387f593b
--- /dev/null
+++ b/archive/2025/summer/msc_berkay_eren_ueruen/qemu_scripts/unbind.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
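+# Detach the GPU from its current host driver and register its vendor:device ID
+# with vfio-pci so the device can be passed through to the VM.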
+gpu="0000:ca:00.0"
+gpu_vd="$(cat /sys/bus/pci/devices/$gpu/vendor) $(cat /sys/bus/pci/devices/$gpu/device)"
+
+echo "$gpu" > "/sys/bus/pci/devices/$gpu/driver/unbind"
+echo "$gpu_vd" > /sys/bus/pci/drivers/vfio-pci/new_id
+