33 changes: 15 additions & 18 deletions ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -161,7 +161,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
ov::PartialShape stateful_kv_shape;
// GGML_BACKEND_BUFFER_USAGE_ANY are kv caches
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
assert(src_name.find("cache_k") == 0 || src_name.find("cache_v") == 0);
if (auto it = std::find(m_model_params.kv_names.begin(), m_model_params.kv_names.end(), src_name);
it == m_model_params.kv_names.end()) {
m_model_params.kv_names.push_back(src_name);
@@ -242,18 +241,18 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
case GGML_OP_PERMUTE: {
if (node->src[0]->op != GGML_OP_VIEW) {
op_case = 1;
} else if (ggml_is_contiguous(node->src[0])) {
} else if (node->src[0]->src[0]->op == GGML_OP_NONE) {
// kv cache tensor
std::string src_name(node->view_src->name);
if (src_name.find("cache") == std::string::npos) {
op_case = 4;
int layer = extract_layer_from_name(src_name);
if (!is_swa_layer(layer)) {
op_case = 2;
} else {
int layer = extract_layer_from_name(src_name);
if (!is_swa_layer(layer)) {
op_case = 2;
} else {
op_case = 3;
}
op_case = 3;
}
} else if (node->src[0]->src[0]->op == GGML_OP_ROPE || node->src[0]->src[0]->src[0]->op == GGML_OP_ROPE) {
// rope'ed query tensor
op_case = 4;
}
break;
}
@@ -383,16 +382,16 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
auto name = std::string(input->name);
ov::PartialShape input_shape;

if ((op->op == GGML_OP_GET_ROWS && op->src[0]->op == GGML_OP_NONE) || op->op == GGML_OP_ROPE) {
if (is_inp_tok(input, op) || is_inp_pos(input, op)) {
// tokens or positions
int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
input_shape = ov::PartialShape{1, 1, 1, len};

} else if (op->op == GGML_OP_GET_ROWS) {
} else if (is_output_idx(input, op)) {
// output index
input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1};

} else if (op->op == GGML_OP_CPY || op->op == GGML_OP_FLASH_ATTN_EXT) {
} else if (is_inp_mask(input, op)) {
// mask
if (m_is_static) {
input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx};
@@ -402,15 +401,15 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
input_shape = ov::PartialShape{-1, 1, -1, -1};
}

} else if (op && op->op == GGML_OP_SET_ROWS && op->src[2] == input) {
} else if (is_kvcache(input, op)) {
// kvcache
input_shape = ov::PartialShape{get_shape(input)};
if (!m_is_static) {
// do not fix ctx size to make llama-bench work
input_shape[2] = -1;
}

} else if (op && op->op == GGML_OP_SET_ROWS && op->src[1] == input) {
} else if (is_kv_idx(input, op)) {
// kv update index
int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
input_shape = ov::PartialShape{1, 1, 1, len};
@@ -490,9 +489,7 @@ const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name
std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const {
std::map<std::string, std::string> kv_param_res_names;
for (const auto & name : m_model_params.kv_names) {
if (name.find("cache_k") == 0 || name.find("cache_v") == 0) {
kv_param_res_names[name] = name;
}
kv_param_res_names[name] = name;
}
return kv_param_res_names;
}
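A note on the shape convention in get_graph_input_shape above: in an ov::PartialShape, a dimension written as -1 is dynamic, so the non-static path accepts token sequences of any length, while the static path pins the length to the prefill chunk size or to 1 for decode. A minimal sketch of that pattern, assuming OpenVINO's C++ API; the function name below is illustrative and not part of the PR:

#include <openvino/openvino.hpp>

// Sketch only: mirrors how token/position/kv-index input shapes are chosen above.
// A length of -1 leaves the last dimension dynamic; static graphs get a fixed length.
ov::PartialShape token_like_input_shape(bool is_static, bool is_prefill, int64_t prefill_chunk_size) {
    int64_t len = is_static ? (is_prefill ? prefill_chunk_size : 1) : -1;
    return ov::PartialShape{1, 1, 1, len};
}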
28 changes: 28 additions & 0 deletions ggml/src/ggml-openvino/ggml-decoder.h
@@ -213,6 +213,34 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
static std::string compute_op_type(const ggml_tensor * node);
void add_extra_inputs();

inline static bool is_inp_tok(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op == GGML_OP_NONE;
}

inline static bool is_inp_pos(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_ROPE && tensor == op->src[1];
}

inline static bool is_inp_emb(const ggml_tensor * tensor, const ggml_tensor * op) {
return tensor->op == GGML_OP_GET_ROWS && op->op == GGML_OP_RMS_NORM;
}

inline static bool is_inp_mask(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]);
}

inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_SET_ROWS && op->src[2] == tensor;
}

inline static bool is_kv_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_SET_ROWS && op->src[1] == tensor;
}

inline static bool is_output_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op != GGML_OP_NONE;
}

private:
void set_input_output(ggml_tensor * node, bool naive = false);
int compute_op_case(const ggml_tensor * node) const;
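The helpers added above classify a graph input structurally, by the op that consumes it and the operand slot it occupies, instead of matching ggml tensor names such as "inp_tokens" or "self_kq_mask" as the old code did. A rough sketch of how a caller might use them; the enum and function below are hypothetical and not part of the PR:

#include "ggml-decoder.h"  // assumed include path for GgmlOvDecoder and ggml_tensor

// Hypothetical helper: map a graph input to a coarse category via the predicates above.
enum class input_kind { tokens, positions, mask, kv_cache, kv_index, output_index, other };

static input_kind classify_input(const ggml_tensor * tensor, const ggml_tensor * op) {
    if (GgmlOvDecoder::is_inp_tok(tensor, op))    return input_kind::tokens;
    if (GgmlOvDecoder::is_inp_pos(tensor, op))    return input_kind::positions;
    if (GgmlOvDecoder::is_inp_mask(tensor, op))   return input_kind::mask;
    if (GgmlOvDecoder::is_kvcache(tensor, op))    return input_kind::kv_cache;
    if (GgmlOvDecoder::is_kv_idx(tensor, op))     return input_kind::kv_index;
    if (GgmlOvDecoder::is_output_idx(tensor, op)) return input_kind::output_index;
    return input_kind::other;
}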
3 changes: 2 additions & 1 deletion ggml/src/ggml-openvino/ggml-openvino-extra.cpp
@@ -85,7 +85,8 @@ void ggml_openvino_device_config::init() {
// Release the context (queue keeps a reference)
clReleaseContext(cl_ctx);
} else if (device_name == "NPU") {
remote_context = ov_singleton_core().get_default_context(device_name);
// remote tensor is not used for NPU yet
// remote_context = ov_singleton_core().get_default_context(device_name);
}

initialized = true;
4 changes: 2 additions & 2 deletions ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -139,8 +139,8 @@ static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_bu
ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;

// Put kvcache on device memory for GPU (NPU memory is too small even for kvcache)
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY && strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote &&
ggml_openvino_get_device_name() == "GPU" && !getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) {
if (strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && ggml_openvino_get_device_name() == "GPU" &&
!getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) {
GGML_ASSERT(ctx->tensor_extras.empty());
auto device = ctx->device;
auto size = ctx->size;

@wine99 (Collaborator, Author) commented on Feb 3, 2026, on the changed condition above:
This is the only place where tensor names appear outside ggml/src/ggml-openvino/openvino.
Inside ggml/src/ggml-openvino/openvino, translate_sessions.cpp still has a few tensor names.
@zhaixuejun1993 @ynimmaga @cavusmustafa
18 changes: 9 additions & 9 deletions ggml/src/ggml-openvino/utils.cpp
@@ -508,8 +508,8 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml
const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name);
const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor);

if (param_name == "inp_pos" || param_name == "inp_tokens" ||
(op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) {
if (GgmlOvDecoder::is_inp_tok(ggml_tensor, op) || GgmlOvDecoder::is_inp_pos(ggml_tensor, op) ||
GgmlOvDecoder::is_kv_idx(ggml_tensor, op)) {
assert(ggml_tensor->ne[0] == 1);
ov::Shape input_shape = {1, 1, 1, 1};
ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
@@ -523,7 +523,7 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml
return input_tensor;
}

if (param_name == "inp_out_ids") {
if (GgmlOvDecoder::is_output_idx(ggml_tensor, op)) {
ov::Shape input_shape = {1, 1, 1, 1};
ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
int32_t inp_out_id = *((int32_t *) ggml_tensor->data);
@@ -533,7 +533,7 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml
return input_tensor;
}

if (param_name.find("self_kq_mask") == 0) {
if (GgmlOvDecoder::is_inp_mask(ggml_tensor, op)) {
size_t context_size = ggml_decoder->get_ctx_size();
std::vector<float> padded_data = pad_input<float>(ggml_tensor, 1, context_size, -INFINITY);
ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, 1, context_size});
@@ -557,8 +557,8 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggm
const size_t chunk_valid_size = std::min(chunk_size, input_len - chunk_index * chunk_size);
const size_t chunk_pad_size = chunk_size - chunk_valid_size;

if (param_name == "inp_pos" || param_name == "inp_tokens" ||
(op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) {
if (GgmlOvDecoder::is_inp_tok(ggml_tensor, op) || GgmlOvDecoder::is_inp_pos(ggml_tensor, op) ||
GgmlOvDecoder::is_kv_idx(ggml_tensor, op)) {
ov::Shape input_shape = {1, 1, 1, chunk_size};
ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
// copy the chunk_index-th chunk from ggml_tensor
@@ -585,7 +585,7 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggm
return input_tensor;
}

if (param_name == "inp_out_ids") {
if (GgmlOvDecoder::is_output_idx(ggml_tensor, op)) {
size_t output_len = ggml_decoder->get_compute_params().output_len;
ov::Shape input_shape = {1, 1, 1, output_len};
ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
@@ -600,7 +600,7 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggm
return input_tensor;
}

if (param_name.find("self_kq_mask") == 0) {
if (GgmlOvDecoder::is_inp_mask(ggml_tensor, op)) {
size_t cols = ggml_tensor->ne[0];
size_t rows = ggml_tensor->ne[1];
float * ggml_data = (float *) ggml_tensor->data + chunk_index * chunk_size * cols;
@@ -748,7 +748,7 @@ const ggml_tensor * get_inp_pos_tensor(ggml_cgraph * cgraph) {
if (src == nullptr) {
break;
}
if (std::string(src->name) == "inp_pos") {
if (GgmlOvDecoder::is_inp_pos(src, op)) {
return src;
}
}
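In the static paths above, the attention mask is padded out to the full context (or chunk) width with -INFINITY so the padded positions drop out of attention after softmax. The actual helper is pad_input<T>() elsewhere in utils.cpp; the standalone version below is only a sketch of the idea, and its signature is assumed rather than taken from the PR:

#include <cmath>
#include <cstring>
#include <vector>

// Sketch only: right-pad each row of a row-major float mask from src_cols to dst_cols
// columns, filling the new columns with -INFINITY.
static std::vector<float> pad_mask_rows(const float * src, size_t rows, size_t src_cols, size_t dst_cols) {
    std::vector<float> dst(rows * dst_cols, -INFINITY);
    for (size_t r = 0; r < rows; ++r) {
        std::memcpy(dst.data() + r * dst_cols, src + r * src_cols, src_cols * sizeof(float));
    }
    return dst;
}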