8 changes: 8 additions & 0 deletions CMakeLists.txt
@@ -89,6 +89,7 @@ option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
option(LLAMA_MOE_ENABLE "llama: enable experimental MoE runtime" OFF)

# 3rd party libs
option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
@@ -111,6 +112,10 @@ set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})

if (LLAMA_MOE_ENABLE)
add_compile_definitions(LLAMA_MOE_ENABLE)
endif()

# change the default for these ggml options
if (NOT DEFINED GGML_LLAMAFILE)
set(GGML_LLAMAFILE_DEFAULT ON)
@@ -176,6 +181,9 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT})
add_subdirectory(ggml)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
if (GGML_CUDA)
enable_language(CUDA)
endif()
endif()

if (MINGW)
14 changes: 14 additions & 0 deletions README.md
@@ -305,6 +305,20 @@ The Hugging Face platform provides a variety of online tools for converting, qua

- Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes
- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggml-org/llama.cpp/discussions/10123)

### Converting MoE models

Models with Mixture-of-Experts layers should be exported with the new `GGUF_MOE` metadata so that llama.cpp can route and cache experts lazily. The high-level steps are:

1. Convert the base model with `convert_hf_to_gguf.py --moe` (see the updated script usage below).
2. Ensure the converter emits router tensors (`blk.N.router.*`) and per-expert tensor groups (`blk.N.expert.K.w1`, `w2`, `w3`, …), matching the lowercase naming used in `docs/development/HOWTO-add-model.md`.
3. Provide per-layer metadata keys:
- `moe.layer.N.num_experts`
- `moe.layer.N.top_k`
- optionally, `moe.layer.N.router_type`
4. Run `python examples/moe_loader.py --validate path/to/model.gguf` to verify expert handles before inference.

With these fields populated, llama.cpp will mmap each expert independently and hydrate them into GPU memory only when the router selects them.
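
If you want a quick sanity check before wiring up the loader, the sketch below does a rough equivalent of the validation step in plain Python. It is illustrative only: it assumes the `gguf` Python package's `GGUFReader` interface (`fields` and `tensors` attributes), the tensor names listed above, and an example layer count; `examples/moe_loader.py --validate` remains the supported path.

```python
# Sketch only: sanity-check the provisional GGUF_MOE layout with the gguf Python package.
# Assumes GGUFReader exposes `fields` (key -> field) and `tensors` (entries with a .name).
from gguf import GGUFReader

def check_moe_layer(reader: GGUFReader, i: int) -> bool:
    # required per-layer metadata keys
    for key in (f"moe.layer.{i}.num_experts", f"moe.layer.{i}.top_k"):
        if key not in reader.fields:
            print(f"missing metadata key: {key}")
            return False
    names = {t.name for t in reader.tensors}
    # router tensors plus at least the first expert's projections must be present
    required  = [f"blk.{i}.router.w1", f"blk.{i}.router.w2"]
    required += [f"blk.{i}.expert.0.{w}" for w in ("w1", "w2", "w3")]
    missing = [n for n in required if n not in names]
    for n in missing:
        print(f"missing tensor: {n}")
    return not missing

reader = GGUFReader("path/to/model.gguf")
n_layers = 32  # assumed layer count for the example; read it from the model metadata in practice
ok = all(check_moe_layer(reader, i) for i in range(n_layers))
print("MoE layout looks complete" if ok else "MoE layout incomplete")
```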
- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268)
- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669)

30 changes: 30 additions & 0 deletions common/arg.cpp
@@ -1951,6 +1951,36 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.kv_unified = true;
}
).set_env("LLAMA_ARG_KV_SPLIT"));
#ifdef LLAMA_MOE_ENABLE
add_opt(common_arg(
{"--moe-enable"},
"enable dynamic Mixture-of-Experts routing with on-demand expert caching",
[](common_params & params) {
params.moe_enable = true;
}
).set_env("LLAMA_ARG_MOE_ENABLE"));
add_opt(common_arg(
{"--moe-cache-size"}, "N",
string_format("number of experts pinned in VRAM per device (default: %d, 0 = auto)", params.moe_cache_size),
[](common_params & params, int value) {
params.moe_cache_size = value;
}
).set_env("LLAMA_ARG_MOE_CACHE"));
add_opt(common_arg(
{"--moe-prefetch"},
string_format("overlap expert DMA with compute (default: %s)", params.moe_prefetch ? "true" : "false"),
[](common_params & params) {
params.moe_prefetch = true;
}
).set_env("LLAMA_ARG_MOE_PREFETCH"));
add_opt(common_arg(
{"--moe-prefetch-lookahead"}, "N",
string_format("number of micro-batches to prefetch ahead (default: %d)", params.moe_prefetch_lookahead),
[](common_params & params, int value) {
params.moe_prefetch_lookahead = value;
}
).set_env("LLAMA_ARG_MOE_PREFETCH_LOOKAHEAD"));
#endif
add_opt(common_arg(
{"--no-context-shift"},
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
14 changes: 10 additions & 4 deletions common/common.cpp
@@ -1180,10 +1180,16 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload;
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;
cparams.kv_unified = params.kv_unified;
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;
cparams.kv_unified = params.kv_unified;
#ifdef LLAMA_MOE_ENABLE
cparams.moe_enable = params.moe_enable;
cparams.moe_cache_size = params.moe_cache_size > 0 ? (uint32_t) params.moe_cache_size : 0;
cparams.moe_prefetch = params.moe_prefetch;
cparams.moe_prefetch_lookahead = params.moe_prefetch_lookahead > 0 ? (uint32_t) params.moe_prefetch_lookahead : 1;
#endif

cparams.type_k = params.cache_type_k;
cparams.type_v = params.cache_type_v;
7 changes: 7 additions & 0 deletions common/common.h
@@ -302,6 +302,13 @@ struct common_params {

enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

#ifdef LLAMA_MOE_ENABLE
bool moe_enable = false; // enable dynamic MoE routing
int32_t moe_cache_size = 0; // number of experts kept resident per device
bool moe_prefetch = false; // enable async prefetch
int32_t moe_prefetch_lookahead = 1; // number of micro-batches to prefetch
#endif

struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;

16 changes: 16 additions & 0 deletions docs/development/HOWTO-add-model.md
@@ -117,6 +117,22 @@ Note: to debug the inference graph: you can use [llama-eval-callback](/examples/

https://github.com/ggml-org/ggml/blob/master/docs/gguf.md

### GGUF_MOE (provisional)

The `GGUF_MOE` extension introduces explicit router tensors and per-expert tensor groups that can be dynamically paged in at runtime. When exporting a model with mixture-of-experts layers, populate the following metadata keys and tensor groups:

- Metadata keys (per MoE layer):
- `moe.layer.{i}.num_experts` – total number of experts in the layer.
- `moe.layer.{i}.top_k` – active experts per token.
- `moe.layer.{i}.router_type` – optional string describing router activation (e.g. `softmax`).
- Router tensors:
- `blk.{i}.router.w1`, `blk.{i}.router.w2` (plus bias variants when present).
- Expert tensor groups:
- `blk.{i}.expert.{e}.w1`, `blk.{i}.expert.{e}.w2`, `blk.{i}.expert.{e}.w3`, etc., matching the FFN projections for each expert `e`.
- Shared expert tensors continue to use the existing `ffn_*_shexp` names.

All expert tensors must be stored as standalone GGUF entries (not packed in the last dimension). This allows llama.cpp to mmap each expert independently and back the CUDA ExpertCache with fine-grained handles.
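
For orientation, here is a minimal writer-side sketch of that layout. It is not taken from the converter: it assumes the `gguf` Python package's `GGUFWriter` API, and the layer/expert counts, shapes, and zero-filled weights are placeholders; a real exporter would also emit the usual architecture and hyperparameter metadata.

```python
# Sketch only: emit the provisional GGUF_MOE metadata keys and standalone per-expert tensors.
# All counts, shapes and the "llama" arch string below are placeholder assumptions.
import numpy as np
from gguf import GGUFWriter

n_layers, n_experts, top_k = 2, 4, 2
n_embd, n_ff = 64, 256

writer = GGUFWriter("moe-demo.gguf", arch="llama")
for i in range(n_layers):
    # per-layer metadata
    writer.add_uint32(f"moe.layer.{i}.num_experts", n_experts)
    writer.add_uint32(f"moe.layer.{i}.top_k", top_k)
    writer.add_string(f"moe.layer.{i}.router_type", "softmax")
    # router projections
    writer.add_tensor(f"blk.{i}.router.w1", np.zeros((n_experts, n_embd), dtype=np.float32))
    writer.add_tensor(f"blk.{i}.router.w2", np.zeros((n_experts, n_embd), dtype=np.float32))
    # one standalone GGUF entry per expert, rather than a single packed 3-D tensor
    for e in range(n_experts):
        writer.add_tensor(f"blk.{i}.expert.{e}.w1", np.zeros((n_ff, n_embd), dtype=np.float32))
        writer.add_tensor(f"blk.{i}.expert.{e}.w2", np.zeros((n_embd, n_ff), dtype=np.float32))
        writer.add_tensor(f"blk.{i}.expert.{e}.w3", np.zeros((n_ff, n_embd), dtype=np.float32))

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
```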

## Resources

- YaRN RoPE scaling https://github.com/ggml-org/llama.cpp/pull/2268
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -33,6 +33,7 @@ else()
add_subdirectory(gen-docs)
add_subdirectory(training)
add_subdirectory(diffusion)
add_subdirectory(moe)
add_subdirectory(model-conversion)
if (NOT GGML_BACKEND_DL)
add_subdirectory(convert-llama2c-to-ggml)
3 changes: 3 additions & 0 deletions examples/moe/CMakeLists.txt
@@ -0,0 +1,3 @@
add_executable(moe-loader main.cpp)
target_link_libraries(moe-loader PRIVATE llama Threads::Threads)
target_include_directories(moe-loader PRIVATE ${CMAKE_SOURCE_DIR}/src)