From 908eef83ca0ed9f2119aa5264ffd6a31abfe34ff Mon Sep 17 00:00:00 2001
From: Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
Date: Thu, 20 Nov 2025 07:33:56 -0500
Subject: [PATCH 01/11] Add llama-bench for android (#1)

---
 .../extra-android-android-arm64-tools.yml     | 66 +++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 .github/workflows/extra-android-android-arm64-tools.yml

diff --git a/.github/workflows/extra-android-android-arm64-tools.yml b/.github/workflows/extra-android-android-arm64-tools.yml
new file mode 100644
index 00000000000..90858383981
--- /dev/null
+++ b/.github/workflows/extra-android-android-arm64-tools.yml
@@ -0,0 +1,66 @@
+name: Android ARM64 llama-bench Build
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - benchmarks
+
+  pull_request:
+    branches:
+      - benchmarks
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  # https://developer.android.com/ndk/downloads
+  NDK_VERSION: "29.0.14206865"
+
+jobs:
+  android-arm64-llama-bench:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v5
+
+      - uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: android-arm64-llama-bench
+
+      - uses: actions/setup-java@v3
+        with:
+          java-version: "17"
+          distribution: "temurin"
+
+      - uses: android-actions/setup-android@v3
+        with:
+          log-accepted-android-sdk-licenses: false
+
+      - run: |
+          sdkmanager "ndk;${{ env.NDK_VERSION }}"
+          echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
+
+      - run: |
+          cmake -B build -G Ninja \
+            -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
+            -DANDROID_ABI=arm64-v8a \
+            -DANDROID_PLATFORM=android-28 \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_NATIVE=OFF \
+            -DGGML_BACKEND_DL=ON \
+            -DGGML_CPU_ALL_VARIANTS=ON \
+            -DLLAMA_CURL=OFF \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF
+          cmake --build build --config Release --target llama-bench
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: llama-bench-android-arm64-v8a
+          path: build/bin/*
+          if-no-files-found: error

From 0626fa061ff2ff8e3a6be3b0069beeef0d192526 Mon Sep 17 00:00:00 2001
From: Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
Date: Thu, 20 Nov 2025 07:54:33 -0500
Subject: [PATCH 02/11] Add Vulkan build (#2)

---
 .../extra-android-android-arm64-tools.yml     |  66 --------
 .github/workflows/extra_benchmark_tools.yml   | 148 ++++++++++++++++++
 2 files changed, 148 insertions(+), 66 deletions(-)
 delete mode 100644 .github/workflows/extra-android-android-arm64-tools.yml
 create mode 100644 .github/workflows/extra_benchmark_tools.yml

diff --git a/.github/workflows/extra-android-android-arm64-tools.yml b/.github/workflows/extra-android-android-arm64-tools.yml
deleted file mode 100644
index 90858383981..00000000000
--- a/.github/workflows/extra-android-android-arm64-tools.yml
+++ /dev/null
@@ -1,66 +0,0 @@
-name: Android ARM64 llama-bench Build
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - benchmarks
-
-  pull_request:
-    branches:
-      - benchmarks
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-env:
-  # https://developer.android.com/ndk/downloads
-  NDK_VERSION: "29.0.14206865"
-
-jobs:
-  android-arm64-llama-bench:
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: actions/checkout@v5
-
-      - uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: android-arm64-llama-bench
-
-      - uses: actions/setup-java@v3
-        with:
-          java-version: "17"
-          distribution: "temurin"
-
-      - uses: android-actions/setup-android@v3
-        with:
-          log-accepted-android-sdk-licenses: false
-
-      - run: |
-          sdkmanager "ndk;${{ env.NDK_VERSION }}"
-          echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
-
-      - run: |
-          cmake -B build -G Ninja \
-            -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
-            -DANDROID_ABI=arm64-v8a \
-            -DANDROID_PLATFORM=android-28 \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_NATIVE=OFF \
-            -DGGML_BACKEND_DL=ON \
-            -DGGML_CPU_ALL_VARIANTS=ON \
-            -DLLAMA_CURL=OFF \
-            -DGGML_OPENMP=OFF \
-            -DLLAMA_BUILD_TOOLS=ON \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF
-          cmake --build build --config Release --target llama-bench
-
-      - uses: actions/upload-artifact@v4
-        with:
-          name: llama-bench-android-arm64-v8a
-          path: build/bin/*
-          if-no-files-found: error
diff --git a/.github/workflows/extra_benchmark_tools.yml b/.github/workflows/extra_benchmark_tools.yml
new file mode 100644
index 00000000000..9c90343b44e
--- /dev/null
+++ b/.github/workflows/extra_benchmark_tools.yml
@@ -0,0 +1,148 @@
+name: Extra Benchmark Tools
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - benchmarks
+
+  pull_request:
+    branches:
+      - benchmarks
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  # https://developer.android.com/ndk/downloads
+  NDK_VERSION: "29.0.14206865"
+
+jobs:
+  android-arm64:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v5
+
+      - uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: android-arm64-llama-bench
+
+      - uses: actions/setup-java@v3
+        with:
+          java-version: "17"
+          distribution: "temurin"
+
+      - uses: android-actions/setup-android@v3
+        with:
+          log-accepted-android-sdk-licenses: false
+
+      - run: |
+          sdkmanager "ndk;${{ env.NDK_VERSION }}"
+          echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
+
+      - run: |
+          cmake -B build -G Ninja \
+            -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
+            -DANDROID_ABI=arm64-v8a \
+            -DANDROID_PLATFORM=android-28 \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_NATIVE=OFF \
+            -DGGML_BACKEND_DL=ON \
+            -DGGML_CPU_ALL_VARIANTS=ON \
+            -DLLAMA_CURL=OFF \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF
+          cmake --build build --config Release --target llama-bench
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: llama-bench-android-arm64-v8a
+          path: build/bin/*
+          if-no-files-found: error
+
+  windows:
+    runs-on: windows-2025
+
+    env:
+      OPENBLAS_VERSION: 0.3.23
+      VULKAN_VERSION: 1.4.313.2
+
+    strategy:
+      matrix:
+        include:
+          - backend: 'vulkan'
+            arch: 'x64'
+            defines: '-DGGML_VULKAN=ON'
+            target: 'ggml-vulkan'
+          - backend: 'opencl-adreno'
+            arch: 'arm64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
+            target: 'ggml-opencl'
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
+          variant: ccache
+          evict-old-files: 1d
+
+      - name: Install Vulkan SDK
+        id: get_vulkan
+        if: ${{ matrix.backend == 'vulkan' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
+          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
+          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
+          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
+
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
+      - name: Install OpenCL Headers and Libs
+        id: install_opencl
+        if: ${{ matrix.backend == 'opencl-adreno' && matrix.arch == 'arm64' }}
+        run: |
+          git clone https://github.com/KhronosGroup/OpenCL-Headers
+          cd OpenCL-Headers
+          cmake -B build `
+            -DBUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build build --target install
+          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+          cd OpenCL-ICD-Loader
+          cmake -B build-arm64-release `
+            -A arm64 `
+            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build build-arm64-release --target install --config release
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_SERVER=OFF
+          cmake --build build --config Release --target llama-bench
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          7z a llama-bench-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-bench-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
+          name: llama-bench-win-${{ matrix.backend }}-${{ matrix.arch }}.zip

From 1a2643deeaf63510ddbd85f7142404448039b403 Mon Sep 17 00:00:00 2001
From: Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
Date: Thu, 20 Nov 2025 08:22:56 -0500
Subject: [PATCH 03/11] Add release button (#5)

---
 .github/workflows/extra_benchmark_tools.yml | 89 +++++++++++++++++++++
 1 file changed, 89 insertions(+)

diff --git a/.github/workflows/extra_benchmark_tools.yml b/.github/workflows/extra_benchmark_tools.yml
index 9c90343b44e..33b4c3d3a33 100644
--- a/.github/workflows/extra_benchmark_tools.yml
+++ b/.github/workflows/extra_benchmark_tools.yml
@@ -2,6 +2,11 @@ name: Extra Benchmark Tools
 
 on:
   workflow_dispatch:
+    inputs:
+      create_release:
+        description: 'Create new release'
+        required: true
+        type: boolean
   push:
     branches:
       - benchmarks
@@ -146,3 +151,87 @@ jobs:
         with:
           path: llama-bench-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
           name: llama-bench-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
+
+  release:
+    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/benchmarks' ) || github.event.inputs.create_release == 'true' }}
+
+    # Fine-grained permissions
+    # https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+    permissions:
+        contents: write # for creating release
+
+    runs-on: ubuntu-latest
+
+    needs:
+      - android-arm64
+      - windows
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Download artifacts
+        id: download-artifact
+        uses: actions/download-artifact@v4
+        with:
+          path: ./artifact
+          merge-multiple: true
+
+      - name: Move artifacts
+        id: move_artifacts
+        run: |
+          mkdir -p release
+          mv -v artifact/*.zip release/ || true
+
+          # Package Android artifacts
+          if [ -d "artifact/llama-bench-android-arm64-v8a" ]; then
+            cd artifact
+            zip -r ../release/llama-bench-${{ steps.tag.outputs.name }}-android-arm64-v8a.zip llama-bench-android-arm64-v8a/
+            cd ..
+          fi
+
+          # Rename Windows artifacts to include tag
+          cd release
+          for f in llama-bench-win-*.zip; do
+            if [ -f "$f" ]; then
+              newname="llama-bench-${{ steps.tag.outputs.name }}-${f#llama-bench-}"
+              mv "$f" "$newname"
+            fi
+          done
+
+      - name: Create release
+        id: create_release
+        uses: ggml-org/action-create-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: ${{ steps.tag.outputs.name }}
+
+      - name: Upload release
+        id: upload_release
+        uses: actions/github-script@v3
+        with:
+          github-token: ${{secrets.GITHUB_TOKEN}}
+          script: |
+            const path = require('path');
+            const fs = require('fs');
+            const release_id = '${{ steps.create_release.outputs.id }}';
+            for (let file of await fs.readdirSync('./release')) {
+              if (path.extname(file) === '.zip') {
+                console.log('uploadReleaseAsset', file);
+                await github.repos.uploadReleaseAsset({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  release_id: release_id,
+                  name: file,
+                  data: await fs.readFileSync(`./release/${file}`)
+                });
+              }
+            }

From bb7e07a20e582a4ace7f48b8640e75b5cd745ae5 Mon Sep 17 00:00:00 2001
From: Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
Date: Thu, 20 Nov 2025 09:29:24 -0500
Subject: [PATCH 04/11] Add release button, take 2 (#6)

---
 .github/workflows/extra_benchmark_tools.yml | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/extra_benchmark_tools.yml b/.github/workflows/extra_benchmark_tools.yml
index 33b4c3d3a33..1a9f4e97cc9 100644
--- a/.github/workflows/extra_benchmark_tools.yml
+++ b/.github/workflows/extra_benchmark_tools.yml
@@ -175,7 +175,26 @@ jobs:
 
       - name: Determine tag name
         id: tag
-        uses: ./.github/actions/get-tag-name
+        run: |
+          # Try to get existing tag
+          if git describe --tags --exact-match 2>/dev/null; then
+            TAG_NAME=$(git describe --tags --exact-match)
+            echo "tag_exists=true" >> $GITHUB_OUTPUT
+          else
+            # Generate tag name based on date and short SHA
+            TAG_NAME="bench-tools-$(date +%Y%m%d)-$(git rev-parse --short HEAD)"
+            echo "tag_exists=false" >> $GITHUB_OUTPUT
+          fi
+          echo "name=${TAG_NAME}" >> $GITHUB_OUTPUT
+          echo "Generated tag name: ${TAG_NAME}"
+
+      - name: Create tag if needed
+        if: steps.tag.outputs.tag_exists == 'false'
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+          git tag -a "${{ steps.tag.outputs.name }}" -m "Benchmark tools release ${{ steps.tag.outputs.name }}"
+          git push origin "${{ steps.tag.outputs.name }}"
 
       - name: Download artifacts
         id: download-artifact

From 934f1dc31b652374a5451bcf7b31f1b4d284871f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?=
 <1478977+Alcpz@users.noreply.github.com>
Date: Thu, 20 Nov 2025 14:43:53 +0000
Subject: [PATCH 05/11] llama-bench: Enabled codepath to skip depth computation
 (#7)

Implement conditional prefill computation skipping in llama-bench:
disable computation for --depth prefill while keeping it enabled for
prefill benchmarks.

- Default behavior (no flag): Depth prefill skips computation
- With `--enable-depth-computation`: Depth prefill performs full
computation
- `-p` benchmarks: Always perform computation (not affected by this
flag)
---
 include/llama.h                   |  3 +++
 src/llama-context.cpp             | 20 ++++++++++++++++++++
 src/llama-context.h               |  5 +++++
 tools/llama-bench/llama-bench.cpp |  8 ++++++++
 4 files changed, 36 insertions(+)

diff --git a/include/llama.h b/include/llama.h
index 8547226ff21..4e0b1d2093d 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -906,6 +906,9 @@ extern "C" {
     // If true, all model tensors are activated during llama_decode() to load and cache their weights.
     LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
 
+    LLAMA_API void llama_set_skip_batched_compute(struct llama_context * ctx, bool skip);
+    LLAMA_API bool llama_get_skip_batched_compute(struct llama_context * ctx);
+
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
 
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 70a3ec62dfc..dd4f28a9512 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -715,6 +715,14 @@ void llama_context::set_warmup(bool value) {
     cparams.warmup = value;
 }
 
+void llama_context::set_skip_batched_compute(bool value) {
+    skip_batched_compute = value;
+}
+
+bool llama_context::get_skip_batched_compute() const {
+    return skip_batched_compute;
+}
+
 void llama_context::set_adapter_lora(
             llama_adapter_lora * adapter,
             float scale) {
@@ -1465,6 +1473,10 @@ llm_graph_params llama_context::graph_params(
 ggml_status llama_context::graph_compute(
             ggml_cgraph * gf,
                    bool   batched) {
+    if (batched && skip_batched_compute) {
+        return GGML_STATUS_SUCCESS;
+    }
+
     int n_threads        = batched ? cparams.n_threads_batch : cparams.n_threads;
     ggml_threadpool_t tp = batched ? threadpool_batch        : threadpool;
 
@@ -2465,6 +2477,14 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
     ctx->set_warmup(warmup);
 }
 
+void llama_set_skip_batched_compute(llama_context * ctx, bool skip) {
+    ctx->set_skip_batched_compute(skip);
+}
+
+bool llama_get_skip_batched_compute(llama_context * ctx) {
+    return ctx->get_skip_batched_compute();
+}
+
 void llama_synchronize(llama_context * ctx) {
     ctx->synchronize();
 }
diff --git a/src/llama-context.h b/src/llama-context.h
index 20cbd789554..ec7a06860d7 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -79,6 +79,8 @@ struct llama_context {
     void set_embeddings (bool value);
     void set_causal_attn(bool value);
     void set_warmup(bool value);
+    void set_skip_batched_compute(bool value);
+    bool get_skip_batched_compute() const;
 
     void set_adapter_lora(
             llama_adapter_lora * adapter,
@@ -296,6 +298,9 @@ struct llama_context {
     // env: LLAMA_GRAPH_REUSE_DISABLE
     bool graph_reuse_disable = false;
 
+    // when true, graph_compute() returns success without running batched graphs (used for depth prefill in benchmarks)
+    bool skip_batched_compute = false;
+
     // perf
     mutable int64_t t_start_us  = 0;
     mutable int64_t t_load_us   = 0;
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index 852a512451d..3de18fe709e 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -344,6 +344,7 @@ struct cmd_params {
     bool                             verbose;
     bool                             progress;
     bool                             no_warmup;
+    bool                             enable_depth_computation;
     output_formats                   output_format;
     output_formats                   output_format_stderr;
 };
@@ -382,6 +383,7 @@ static const cmd_params cmd_params_defaults = {
     /* verbose              */ false,
     /* progress             */ false,
     /* no_warmup            */ false,
+    /* enable_depth_computation */ false,
     /* output_format        */ MARKDOWN,
     /* output_format_stderr */ NONE,
 };
@@ -406,6 +408,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -v, --verbose                             verbose output\n");
     printf("  --progress                                print test progress indicators\n");
     printf("  --no-warmup                               skip warmup runs before benchmarking\n");
+    printf("  --enable-depth-computation                enable computation during depth prefill (disabled by default)\n");
     if (llama_supports_rpc()) {
         printf("  -rpc, --rpc <rpc_servers>                 register RPC devices (comma separated)\n");
     }
@@ -509,6 +512,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     params.delay                = cmd_params_defaults.delay;
     params.progress             = cmd_params_defaults.progress;
     params.no_warmup            = cmd_params_defaults.no_warmup;
+    params.enable_depth_computation = cmd_params_defaults.enable_depth_computation;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -933,6 +937,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 params.progress = true;
             } else if (arg == "--no-warmup") {
                 params.no_warmup = true;
+            } else if (arg == "--enable-depth-computation") {
+                params.enable_depth_computation = true;
             } else {
                 invalid_param = true;
                 break;
@@ -2160,7 +2166,9 @@ int main(int argc, char ** argv) {
                         fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
                                 i + 1, params.reps);
                     }
+                    llama_set_skip_batched_compute(ctx, !params.enable_depth_computation);
                     bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
+                    llama_set_skip_batched_compute(ctx, false);
                     if (!res) {
                         fprintf(stderr, "%s: error: failed to run depth\n", __func__);
                         exit(1);

From a956f73a0d8855c27f330c879496f28aff0d134c Mon Sep 17 00:00:00 2001
From: Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
Date: Thu, 20 Nov 2025 10:00:28 -0500
Subject: [PATCH 06/11] Add release button, take 3 (#8)

---
 .github/workflows/extra_benchmark_tools.yml | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/extra_benchmark_tools.yml b/.github/workflows/extra_benchmark_tools.yml
index 1a9f4e97cc9..fdee3d6aa90 100644
--- a/.github/workflows/extra_benchmark_tools.yml
+++ b/.github/workflows/extra_benchmark_tools.yml
@@ -64,10 +64,16 @@ jobs:
             -DLLAMA_BUILD_SERVER=OFF
           cmake --build build --config Release --target llama-bench
 
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          cd build/bin
+          zip -r ../../llama-bench-android-arm64-v8a.zip ./*
+
       - uses: actions/upload-artifact@v4
         with:
-          name: llama-bench-android-arm64-v8a
-          path: build/bin/*
+          name: llama-bench-android-arm64-v8a.zip
+          path: llama-bench-android-arm64-v8a.zip
           if-no-files-found: error
 
   windows:
@@ -209,16 +215,9 @@ jobs:
           mkdir -p release
           mv -v artifact/*.zip release/ || true
 
-          # Package Android artifacts
-          if [ -d "artifact/llama-bench-android-arm64-v8a" ]; then
-            cd artifact
-            zip -r ../release/llama-bench-${{ steps.tag.outputs.name }}-android-arm64-v8a.zip llama-bench-android-arm64-v8a/
-            cd ..
-          fi
-
-          # Rename Windows artifacts to include tag
+          # Rename all artifacts to include tag
           cd release
-          for f in llama-bench-win-*.zip; do
+          for f in llama-bench-*.zip; do
             if [ -f "$f" ]; then
               newname="llama-bench-${{ steps.tag.outputs.name }}-${f#llama-bench-}"
               mv "$f" "$newname"

From 2d985fa8d5bac003f836c02e20f0d752d57ce052 Mon Sep 17 00:00:00 2001
From: Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
Date: Thu, 20 Nov 2025 10:16:32 -0500
Subject: [PATCH 07/11] Add macos build (#9)

---
 .github/workflows/extra_benchmark_tools.yml | 48 +++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/.github/workflows/extra_benchmark_tools.yml b/.github/workflows/extra_benchmark_tools.yml
index fdee3d6aa90..87a0dd30fb4 100644
--- a/.github/workflows/extra_benchmark_tools.yml
+++ b/.github/workflows/extra_benchmark_tools.yml
@@ -76,6 +76,53 @@ jobs:
           path: llama-bench-android-arm64-v8a.zip
           if-no-files-found: error
 
+  macOS-arm64:
+    runs-on: macos-14
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-cmake-arm64-bench
+          evict-old-files: 1d
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          cmake -B build \
+            -DCMAKE_INSTALL_RPATH='@loader_path' \
+            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DGGML_RPC=ON \
+            -DGGML_NATIVE=OFF \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) --target llama-bench
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          cd build/bin
+          zip -r ../../llama-bench-macos-arm64.zip ./*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-bench-macos-arm64.zip
+          name: llama-bench-macos-arm64.zip
+
   windows:
     runs-on: windows-2025
 
@@ -170,6 +217,7 @@ jobs:
 
     needs:
       - android-arm64
+      - macOS-arm64
       - windows
 
     steps:

From 8a656aa6fa31f7a08d379a0bcbdad7d926fd1cbc Mon Sep 17 00:00:00 2001
From: Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
Date: Thu, 20 Nov 2025 11:23:53 -0500
Subject: [PATCH 08/11] Add Windows CUDA bench build job to
 extra_benchmark_tools workflow (#10)

Added a new windows-cuda job that:
- Uses Windows 2022 runner with CUDA 12.4
- Installs CUDA toolkit and Ninja build system
- Builds llama-bench with CUDA support enabled
- Packages and uploads the benchmark tool artifacts
- Follows the same pattern as the release.yml windows-cuda job

Updated the release job to depend on the new windows-cuda job.

*Make sure to read the [contributing
guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md)
before submitting a PR*

Co-authored-by: Claude <noreply@anthropic.com>
---
 .github/workflows/extra_benchmark_tools.yml | 58 +++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/.github/workflows/extra_benchmark_tools.yml b/.github/workflows/extra_benchmark_tools.yml
index 87a0dd30fb4..e7777db9ff2 100644
--- a/.github/workflows/extra_benchmark_tools.yml
+++ b/.github/workflows/extra_benchmark_tools.yml
@@ -205,6 +205,63 @@ jobs:
           path: llama-bench-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
           name: llama-bench-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
 
+  windows-cuda:
+    runs-on: windows-2022
+
+    strategy:
+      matrix:
+        cuda: ['12.4']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Install ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: windows-cuda-bench-${{ matrix.cuda }}
+          variant: ccache
+          evict-old-files: 1d
+
+      - name: Install Cuda Toolkit
+        uses: ./.github/actions/windows-setup-cuda
+        with:
+          cuda_version: ${{ matrix.cuda }}
+
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
+      - name: Build
+        id: cmake_build
+        shell: cmd
+        run: |
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
+          cmake -S . -B build -G "Ninja Multi-Config" ^
+            -DGGML_BACKEND_DL=ON ^
+            -DGGML_NATIVE=OFF ^
+            -DGGML_CUDA=ON ^
+            -DLLAMA_CURL=OFF ^
+            -DLLAMA_BUILD_TOOLS=ON ^
+            -DLLAMA_BUILD_EXAMPLES=OFF ^
+            -DLLAMA_BUILD_TESTS=OFF ^
+            -DLLAMA_BUILD_SERVER=OFF
+          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
+          cmake --build build --config Release -j %NINJA_JOBS% --target llama-bench
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          7z a llama-bench-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-bench-win-cuda-${{ matrix.cuda }}-x64.zip
+          name: llama-bench-win-cuda-${{ matrix.cuda }}-x64.zip
+
   release:
     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/benchmarks' ) || github.event.inputs.create_release == 'true' }}
 
@@ -219,6 +276,7 @@ jobs:
       - android-arm64
       - macOS-arm64
       - windows
+      - windows-cuda
 
     steps:
       - name: Clone

From d1f2d38a9dab8d20296be2fedda22bda834d2035 Mon Sep 17 00:00:00 2001
From: Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
Date: Sat, 22 Nov 2025 22:06:22 -0500
Subject: [PATCH 09/11] Add win cpu variants (#11)

---
 .github/workflows/extra_benchmark_tools.yml | 136 ++++++++++++++++++--
 1 file changed, 126 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/extra_benchmark_tools.yml b/.github/workflows/extra_benchmark_tools.yml
index e7777db9ff2..b71b2dfac59 100644
--- a/.github/workflows/extra_benchmark_tools.yml
+++ b/.github/workflows/extra_benchmark_tools.yml
@@ -123,6 +123,72 @@ jobs:
           path: llama-bench-macos-arm64.zip
           name: llama-bench-macos-arm64.zip
 
+  windows-cpu:
+    runs-on: windows-2025
+
+    strategy:
+      matrix:
+        include:
+          - arch: 'x64'
+          - arch: 'arm64'
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-cpu-bench-${{ matrix.arch }}
+          variant: ccache
+          evict-old-files: 1d
+
+      - name: Install Ninja
+        run: |
+          choco install ninja
+
+      - name: libCURL
+        id: get_libcurl
+        uses: ./.github/actions/windows-setup-curl
+        with:
+          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
+
+      - name: Build
+        shell: cmd
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
+        run: |
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
+          cmake -S . -B build -G "Ninja Multi-Config" ^
+            -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
+            -DGGML_NATIVE=OFF ^
+            -DGGML_BACKEND_DL=ON ^
+            -DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^
+            -DGGML_OPENMP=ON ^
+            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
+            -DLLAMA_BUILD_TOOLS=ON ^
+            -DLLAMA_BUILD_EXAMPLES=OFF ^
+            -DLLAMA_BUILD_TESTS=OFF ^
+            -DLLAMA_BUILD_SERVER=OFF
+          cmake --build build --config Release --target llama-bench
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
+        run: |
+          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
+          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
+          7z a llama-bench-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-bench-win-cpu-${{ matrix.arch }}.zip
+          name: llama-bench-win-cpu-${{ matrix.arch }}.zip
+
   windows:
     runs-on: windows-2025
 
@@ -192,12 +258,12 @@ jobs:
         id: cmake_build
         run: |
           cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_SERVER=OFF
-          cmake --build build --config Release --target llama-bench
+          cmake --build build --config Release --target ${{ matrix.target }}
 
       - name: Pack artifacts
         id: pack_artifacts
         run: |
-          7z a llama-bench-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\*
+          7z a llama-bench-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
 
       - name: Upload artifacts
         uses: actions/upload-artifact@v4
@@ -242,19 +308,30 @@ jobs:
           cmake -S . -B build -G "Ninja Multi-Config" ^
             -DGGML_BACKEND_DL=ON ^
             -DGGML_NATIVE=OFF ^
+            -DGGML_CPU=OFF ^
             -DGGML_CUDA=ON ^
-            -DLLAMA_CURL=OFF ^
-            -DLLAMA_BUILD_TOOLS=ON ^
-            -DLLAMA_BUILD_EXAMPLES=OFF ^
-            -DLLAMA_BUILD_TESTS=OFF ^
-            -DLLAMA_BUILD_SERVER=OFF
+            -DLLAMA_CURL=OFF
           set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
-          cmake --build build --config Release -j %NINJA_JOBS% --target llama-bench
+          cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda
 
       - name: Pack artifacts
         id: pack_artifacts
         run: |
-          7z a llama-bench-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
+          7z a llama-bench-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
+
+      - name: Copy and pack Cuda runtime
+        run: |
+          echo "Cuda install location: ${{ env.CUDA_PATH }}"
+          $dst='.\build\bin\cudart\'
+          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          7z a cudart-llama-bench-win-cuda-${{ matrix.cuda }}-x64.zip $dst\*
+
+      - name: Upload Cuda runtime
+        uses: actions/upload-artifact@v4
+        with:
+          path: cudart-llama-bench-win-cuda-${{ matrix.cuda }}-x64.zip
+          name: cudart-llama-bench-win-cuda-${{ matrix.cuda }}-x64.zip
 
       - name: Upload artifacts
         uses: actions/upload-artifact@v4
@@ -275,6 +352,7 @@ jobs:
     needs:
       - android-arm64
       - macOS-arm64
+      - windows-cpu
       - windows
       - windows-cuda
 
@@ -319,11 +397,49 @@ jobs:
         id: move_artifacts
         run: |
           mkdir -p release
+
+          echo "Adding CPU backend files to existing zips..."
+          for arch in x64 arm64; do
+            cpu_zip="artifact/llama-bench-win-cpu-${arch}.zip"
+            if [ ! -f "$cpu_zip" ]; then
+              echo "CPU zip not found for $arch, skipping..."
+              continue
+            fi
+            temp_dir=$(mktemp -d)
+            echo "Extracting CPU backend for $arch..."
+            unzip "$cpu_zip" -d "$temp_dir"
+
+            echo "Adding CPU files to $arch zips..."
+            for target_zip in artifact/llama-bench-win-*-${arch}.zip; do
+              if [[ "$target_zip" == "$cpu_zip" ]]; then
+                continue
+              fi
+              echo "Adding CPU backend to $(basename "$target_zip")"
+              realpath_target_zip=$(realpath "$target_zip")
+              (cd "$temp_dir" && zip -r "$realpath_target_zip" .)
+            done
+
+            rm -rf "$temp_dir"
+          done
+
+          echo "Renaming and moving zips to release..."
+          for zip_file in artifact/llama-bench-win-*.zip; do
+            base_name=$(basename "$zip_file" .zip)
+            zip_name="llama-bench-${{ steps.tag.outputs.name }}-${base_name#llama-bench-}.zip"
+            echo "Moving $zip_file to release/$zip_name"
+            mv "$zip_file" "release/$zip_name"
+          done
+
+          echo "Moving other artifacts..."
           mv -v artifact/*.zip release/ || true
 
-          # Rename all artifacts to include tag
+          # Rename remaining artifacts to include tag
           cd release
           for f in llama-bench-*.zip; do
+            # Skip already renamed files
+            if [[ "$f" == *"${{ steps.tag.outputs.name }}"* ]]; then
+              continue
+            fi
             if [ -f "$f" ]; then
               newname="llama-bench-${{ steps.tag.outputs.name }}-${f#llama-bench-}"
               mv "$f" "$newname"

From b3eeeb97b641ee3837827c7f043a187be5296c0b Mon Sep 17 00:00:00 2001
From: Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
Date: Mon, 8 Dec 2025 18:36:59 -0500
Subject: [PATCH 10/11] Add Ubuntu CPU, Vulkan and CUDA benchmark builds (#12)

---
 .github/workflows/extra_benchmark_tools.yml | 192 +++++++++++++++++++-
 1 file changed, 191 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/extra_benchmark_tools.yml b/.github/workflows/extra_benchmark_tools.yml
index b71b2dfac59..2b2c469a363 100644
--- a/.github/workflows/extra_benchmark_tools.yml
+++ b/.github/workflows/extra_benchmark_tools.yml
@@ -339,6 +339,180 @@ jobs:
           path: llama-bench-win-cuda-${{ matrix.cuda }}-x64.zip
           name: llama-bench-win-cuda-${{ matrix.cuda }}-x64.zip
 
+  ubuntu-cpu:  # llama-bench CPU build, one job per matrix entry below
+    strategy:
+      matrix:
+        include:  # `build` labels the cache key and tarball; `os` selects the runner
+          - build: 'x64'
+            os: ubuntu-22.04
+          - build: 'arm64'
+            os: ubuntu-22.04-arm
+          # Non-public runners - commented out:
+          # - build: 's390x'
+          #   os: ubuntu-24.04-s390x
+          # - build: 'ppc64le'
+          #   os: ubuntu-24.04-ppc64le
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ubuntu-cpu-bench-${{ matrix.build }}  # separate cache per architecture
+          evict-old-files: 1d
+
+      - name: Build  # Release configure, then build only the llama-bench target
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_NATIVE=OFF \
+            -DGGML_BACKEND_DL=ON \
+            -DLLAMA_CURL=OFF \
+            -DGGML_OPENMP=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF
+          cmake --build build --config Release --target llama-bench -j $(nproc)
+
+      - name: Pack artifacts  # tarball of everything in build/bin, written two levels up
+        id: pack_artifacts
+        run: |
+          cd build/bin
+          tar -czvf ../../llama-bench-ubuntu-cpu-${{ matrix.build }}.tar.gz ./*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-bench-ubuntu-cpu-${{ matrix.build }}.tar.gz
+          name: llama-bench-ubuntu-cpu-${{ matrix.build }}.tar.gz
+          if-no-files-found: error  # a missing tarball fails the job
+
+  ubuntu-vulkan:  # builds only the ggml-vulkan target (GGML_CPU=OFF) and ships build/bin as a tarball
+    runs-on: ubuntu-24.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ubuntu-24-vulkan-bench
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo add-apt-repository -y ppa:kisak/kisak-mesa
+          sudo apt-get update -y
+          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev
+
+      - name: Get latest Vulkan SDK version  # standalone assignment + `curl -f`: a failed lookup aborts the step
+        id: vulkan_sdk_version
+        run: |
+          sdk_version="$(curl -fsSL https://vulkan.lunarg.com/sdk/latest/linux.txt)"; echo "VULKAN_SDK_VERSION=${sdk_version}" >> "$GITHUB_ENV"
+
+      - name: Use Vulkan SDK Cache
+        uses: actions/cache@v4
+        id: cache-sdk
+        with:
+          path: ./vulkan_sdk
+          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}  # new SDK version => new cache entry
+
+      - name: Setup Vulkan SDK  # runs only when the cache step above did not restore ./vulkan_sdk
+        if: steps.cache-sdk.outputs.cache-hit != 'true'
+        uses: ./.github/actions/linux-setup-vulkan
+        with:
+          path: ./vulkan_sdk
+          version: ${{ env.VULKAN_SDK_VERSION }}
+
+      - name: Build  # sources the SDK environment script before configuring
+        id: cmake_build
+        run: |
+          source ./vulkan_sdk/setup-env.sh
+          cmake -B build \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_NATIVE=OFF \
+            -DGGML_CPU=OFF \
+            -DGGML_BACKEND_DL=ON \
+            -DGGML_VULKAN=ON \
+            -DLLAMA_CURL=OFF
+          cmake --build build --config Release --target ggml-vulkan -j $(nproc)
+
+      - name: Pack artifacts  # tarball of everything in build/bin, written two levels up
+        id: pack_artifacts
+        run: |
+          cd build/bin
+          tar -czvf ../../llama-bench-ubuntu-vulkan-x64.tar.gz ./*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-bench-ubuntu-vulkan-x64.tar.gz
+          name: llama-bench-ubuntu-vulkan-x64.tar.gz
+          if-no-files-found: error  # a missing tarball fails the job
+
+  ubuntu-cuda:  # builds only the ggml-cuda target (GGML_CPU=OFF) inside the CUDA devel container
+    runs-on: ubuntu-latest
+    container: nvidia/cuda:12.6.2-devel-ubuntu24.04
+
+    strategy:
+      matrix:
+        cuda: ['12.6']  # used for the cache key and tarball name; the container tag above is set separately
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Install dependencies  # apt-get rather than apt: apt's CLI is not stable for scripts
+        env:
+          DEBIAN_FRONTEND: noninteractive
+        run: |
+          apt-get update
+          apt-get install -y cmake build-essential ninja-build libgomp1 git
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ubuntu-cuda-bench-${{ matrix.cuda }}
+          evict-old-files: 1d
+
+      - name: Build  # Ninja Release build restricted to CUDA architecture 89-real
+        id: cmake_build
+        run: |
+          cmake -S . -B build -G Ninja \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DCMAKE_CUDA_ARCHITECTURES=89-real \
+            -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
+            -DGGML_NATIVE=OFF \
+            -DGGML_CPU=OFF \
+            -DGGML_BACKEND_DL=ON \
+            -DGGML_CUDA=ON \
+            -DLLAMA_CURL=OFF
+          cmake --build build --config Release --target ggml-cuda
+
+      - name: Pack artifacts  # tarball of everything in build/bin, written two levels up
+        id: pack_artifacts
+        run: |
+          cd build/bin
+          tar -czvf ../../llama-bench-ubuntu-cuda-${{ matrix.cuda }}-x64.tar.gz ./*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-bench-ubuntu-cuda-${{ matrix.cuda }}-x64.tar.gz
+          name: llama-bench-ubuntu-cuda-${{ matrix.cuda }}-x64.tar.gz
+          if-no-files-found: error  # a missing tarball fails the job
+
   release:
     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/benchmarks' ) || github.event.inputs.create_release == 'true' }}
 
@@ -355,6 +529,9 @@ jobs:
       - windows-cpu
       - windows
       - windows-cuda
+      - ubuntu-cpu
+      - ubuntu-vulkan
+      - ubuntu-cuda
 
     steps:
       - name: Clone
@@ -432,6 +609,7 @@ jobs:
 
           echo "Moving other artifacts..."
           mv -v artifact/*.zip release/ || true
+          mv -v artifact/*.tar.gz release/ || true
 
           # Rename remaining artifacts to include tag
           cd release
@@ -446,6 +624,17 @@ jobs:
             fi
           done
 
+          for f in llama-bench-*.tar.gz; do
+            # Skip already renamed files
+            if [[ "$f" == *"${{ steps.tag.outputs.name }}"* ]]; then
+              continue
+            fi
+            if [ -f "$f" ]; then
+              newname="llama-bench-${{ steps.tag.outputs.name }}-${f#llama-bench-}"
+              mv "$f" "$newname"
+            fi
+          done
+
       - name: Create release
         id: create_release
         uses: ggml-org/action-create-release@v1
@@ -464,7 +653,8 @@ jobs:
             const fs = require('fs');
             const release_id = '${{ steps.create_release.outputs.id }}';
             for (let file of await fs.readdirSync('./release')) {
-              if (path.extname(file) === '.zip') {
+              const ext = path.extname(file);
+              if (ext === '.zip' || ext === '.gz') {
                 console.log('uploadReleaseAsset', file);
                 await github.repos.uploadReleaseAsset({
                   owner: context.repo.owner,

From 3e01ecd85b665213109a4499e74fb6421b4c5a61 Mon Sep 17 00:00:00 2001
From: Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
Date: Wed, 14 Jan 2026 16:54:01 -0500
Subject: [PATCH 11/11] Disable libcurl in the Windows CPU build (#13)

---
 .github/workflows/extra_benchmark_tools.yml | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/.github/workflows/extra_benchmark_tools.yml b/.github/workflows/extra_benchmark_tools.yml
index 2b2c469a363..f5de24f5371 100644
--- a/.github/workflows/extra_benchmark_tools.yml
+++ b/.github/workflows/extra_benchmark_tools.yml
@@ -149,16 +149,8 @@ jobs:
         run: |
           choco install ninja
 
-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-        with:
-          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
-
       - name: Build
         shell: cmd
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
           call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
           cmake -S . -B build -G "Ninja Multi-Config" ^
@@ -167,7 +159,7 @@ jobs:
             -DGGML_BACKEND_DL=ON ^
             -DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^
             -DGGML_OPENMP=ON ^
-            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
+            -DLLAMA_CURL=OFF ^
             -DLLAMA_BUILD_TOOLS=ON ^
             -DLLAMA_BUILD_EXAMPLES=OFF ^
             -DLLAMA_BUILD_TESTS=OFF ^
@@ -176,10 +168,7 @@ jobs:
 
       - name: Pack artifacts
         id: pack_artifacts
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
-          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
           Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
           7z a llama-bench-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*