8 changes: 8 additions & 0 deletions CMakeLists.txt
@@ -89,6 +89,7 @@ option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
option(LLAMA_MOE_ENABLE "llama: enable experimental MoE runtime" OFF)

# 3rd party libs
option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
@@ -111,6 +112,10 @@ set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})

if (LLAMA_MOE_ENABLE)
add_compile_definitions(LLAMA_MOE_ENABLE)
endif()

# change the default for these ggml options
if (NOT DEFINED GGML_LLAMAFILE)
set(GGML_LLAMAFILE_DEFAULT ON)
@@ -176,6 +181,9 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT})
add_subdirectory(ggml)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
if (GGML_CUDA)
enable_language(CUDA)
endif()
endif()

if (MINGW)
14 changes: 14 additions & 0 deletions README.md
@@ -305,6 +305,20 @@ The Hugging Face platform provides a variety of online tools for converting, qua

- Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes
- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggml-org/llama.cpp/discussions/10123)

### Converting MoE models

Models with Mixture-of-Experts layers should be exported with the new `GGUF_MOE` metadata so that llama.cpp can route and cache experts lazily. The high-level steps are:

1. Convert the base model with `convert_hf_to_gguf.py --moe` (see the updated script usage below).
2. Ensure the converter emits router tensors (`blk.N.router.*`) and per-expert tensor groups (`blk.N.expert.K.w1`, `w2`, `w3`, …), matching the lowercase naming used in `docs/development/HOWTO-add-model.md`.
3. Provide per-layer metadata keys:
- `moe.layer.N.num_experts`
- `moe.layer.N.top_k`
- optionally, `moe.layer.N.router_type`
4. Run `python examples/moe_loader.py --validate path/to/model.gguf` to verify expert handles before inference.

With these fields populated, llama.cpp will mmap each expert independently and hydrate them into GPU memory only when the router selects them.
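
If you want a quick sanity check before wiring up the loader, the sketch below does a rough equivalent of the validation step in plain Python. It is illustrative only: it assumes the `gguf` Python package's `GGUFReader` interface (`fields` and `tensors` attributes), the tensor names listed above, and an example layer count; `examples/moe_loader.py --validate` remains the supported path.

```python
# Sketch only: sanity-check the provisional GGUF_MOE layout with the gguf Python package.
# Assumes GGUFReader exposes `fields` (key -> field) and `tensors` (entries with a .name).
from gguf import GGUFReader

def check_moe_layer(reader: GGUFReader, i: int) -> bool:
    # required per-layer metadata keys
    for key in (f"moe.layer.{i}.num_experts", f"moe.layer.{i}.top_k"):
        if key not in reader.fields:
            print(f"missing metadata key: {key}")
            return False
    names = {t.name for t in reader.tensors}
    # router tensors plus at least the first expert's projections must be present
    required  = [f"blk.{i}.router.w1", f"blk.{i}.router.w2"]
    required += [f"blk.{i}.expert.0.{w}" for w in ("w1", "w2", "w3")]
    missing = [n for n in required if n not in names]
    for n in missing:
        print(f"missing tensor: {n}")
    return not missing

reader = GGUFReader("path/to/model.gguf")
n_layers = 32  # assumed layer count for the example; read it from the model metadata in practice
ok = all(check_moe_layer(reader, i) for i in range(n_layers))
print("MoE layout looks complete" if ok else "MoE layout incomplete")
```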
- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268)
- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669)

30 changes: 30 additions & 0 deletions common/arg.cpp
@@ -1951,6 +1951,36 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.kv_unified = true;
}
).set_env("LLAMA_ARG_KV_SPLIT"));
#ifdef LLAMA_MOE_ENABLE
add_opt(common_arg(
{"--moe-enable"},
"enable dynamic Mixture-of-Experts routing with on-demand expert caching",
[](common_params & params) {
params.moe_enable = true;
}
).set_env("LLAMA_ARG_MOE_ENABLE"));
add_opt(common_arg(
{"--moe-cache-size"}, "N",
string_format("number of experts pinned in VRAM per device (default: %d, 0 = auto)", params.moe_cache_size),
[](common_params & params, int value) {
params.moe_cache_size = value;
}
).set_env("LLAMA_ARG_MOE_CACHE"));
add_opt(common_arg(
{"--moe-prefetch"},
string_format("overlap expert DMA with compute (default: %s)", params.moe_prefetch ? "true" : "false"),
[](common_params & params) {
params.moe_prefetch = true;
}
).set_env("LLAMA_ARG_MOE_PREFETCH"));
add_opt(common_arg(
{"--moe-prefetch-lookahead"}, "N",
string_format("number of micro-batches to prefetch ahead (default: %d)", params.moe_prefetch_lookahead),
[](common_params & params, int value) {
params.moe_prefetch_lookahead = value;
}
).set_env("LLAMA_ARG_MOE_PREFETCH_LOOKAHEAD"));
#endif
add_opt(common_arg(
{"--no-context-shift"},
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
14 changes: 10 additions & 4 deletions common/common.cpp
@@ -1180,10 +1180,16 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload;
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;
cparams.kv_unified = params.kv_unified;
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;
cparams.kv_unified = params.kv_unified;
#ifdef LLAMA_MOE_ENABLE
cparams.moe_enable = params.moe_enable;
cparams.moe_cache_size = params.moe_cache_size > 0 ? (uint32_t) params.moe_cache_size : 0;
cparams.moe_prefetch = params.moe_prefetch;
cparams.moe_prefetch_lookahead = params.moe_prefetch_lookahead > 0 ? (uint32_t) params.moe_prefetch_lookahead : 1;
#endif

cparams.type_k = params.cache_type_k;
cparams.type_v = params.cache_type_v;
7 changes: 7 additions & 0 deletions common/common.h
@@ -302,6 +302,13 @@ struct common_params {

enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

#ifdef LLAMA_MOE_ENABLE
bool moe_enable = false; // enable dynamic MoE routing
int32_t moe_cache_size = 0; // number of experts kept resident per device
bool moe_prefetch = false; // enable async prefetch
int32_t moe_prefetch_lookahead = 1; // number of micro-batches to prefetch
#endif

struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;

16 changes: 16 additions & 0 deletions docs/development/HOWTO-add-model.md
@@ -117,6 +117,22 @@ Note: to debug the inference graph: you can use [llama-eval-callback](/examples/

https://github.com/ggml-org/ggml/blob/master/docs/gguf.md

### GGUF_MOE (provisional)

The `GGUF_MOE` extension introduces explicit router tensors and per-expert tensor groups that can be dynamically paged in at runtime. When exporting a model with mixture-of-experts layers, populate the following metadata keys and tensor groups:

- Metadata keys (per MoE layer):
- `moe.layer.{i}.num_experts` – total number of experts in the layer.
- `moe.layer.{i}.top_k` – active experts per token.
- `moe.layer.{i}.router_type` – optional string describing router activation (e.g. `softmax`).
- Router tensors:
- `blk.{i}.router.w1`, `blk.{i}.router.w2` (plus bias variants when present).
- Expert tensor groups:
- `blk.{i}.expert.{e}.w1`, `blk.{i}.expert.{e}.w2`, `blk.{i}.expert.{e}.w3`, etc., matching the FFN projections for each expert `e`.
- Shared expert tensors continue to use the existing `ffn_*_shexp` names.

All expert tensors must be stored as standalone GGUF entries (not packed in the last dimension). This allows llama.cpp to mmap each expert independently and back the CUDA ExpertCache with fine-grained handles.
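
For orientation, here is a minimal writer-side sketch of that layout. It is not taken from the converter: it assumes the `gguf` Python package's `GGUFWriter` API, and the layer/expert counts, shapes, and zero-filled weights are placeholders; a real exporter would also emit the usual architecture and hyperparameter metadata.

```python
# Sketch only: emit the provisional GGUF_MOE metadata keys and standalone per-expert tensors.
# All counts, shapes and the "llama" arch string below are placeholder assumptions.
import numpy as np
from gguf import GGUFWriter

n_layers, n_experts, top_k = 2, 4, 2
n_embd, n_ff = 64, 256

writer = GGUFWriter("moe-demo.gguf", arch="llama")
for i in range(n_layers):
    # per-layer metadata
    writer.add_uint32(f"moe.layer.{i}.num_experts", n_experts)
    writer.add_uint32(f"moe.layer.{i}.top_k", top_k)
    writer.add_string(f"moe.layer.{i}.router_type", "softmax")
    # router projections
    writer.add_tensor(f"blk.{i}.router.w1", np.zeros((n_experts, n_embd), dtype=np.float32))
    writer.add_tensor(f"blk.{i}.router.w2", np.zeros((n_experts, n_embd), dtype=np.float32))
    # one standalone GGUF entry per expert, rather than a single packed 3-D tensor
    for e in range(n_experts):
        writer.add_tensor(f"blk.{i}.expert.{e}.w1", np.zeros((n_ff, n_embd), dtype=np.float32))
        writer.add_tensor(f"blk.{i}.expert.{e}.w2", np.zeros((n_embd, n_ff), dtype=np.float32))
        writer.add_tensor(f"blk.{i}.expert.{e}.w3", np.zeros((n_ff, n_embd), dtype=np.float32))

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
```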

## Resources

- YaRN RoPE scaling https://github.com/ggml-org/llama.cpp/pull/2268
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -33,6 +33,7 @@ else()
add_subdirectory(gen-docs)
add_subdirectory(training)
add_subdirectory(diffusion)
add_subdirectory(moe)
add_subdirectory(model-conversion)
if (NOT GGML_BACKEND_DL)
add_subdirectory(convert-llama2c-to-ggml)
3 changes: 3 additions & 0 deletions examples/moe/CMakeLists.txt
@@ -0,0 +1,3 @@
add_executable(moe-loader main.cpp)
target_link_libraries(moe-loader PRIVATE llama Threads::Threads)
target_include_directories(moe-loader PRIVATE ${CMAKE_SOURCE_DIR}/src)