Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ggml/include/ggml-openvino.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ GGML_BACKEND_API bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t b

GGML_BACKEND_API bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft);

GGML_BACKEND_API size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer);

// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device);

Expand Down
16 changes: 14 additions & 2 deletions ggml/src/ggml-openvino/ggml-decoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,17 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
add_extra_inputs();
}

void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) {
    // Re-point this decoder at a new cgraph: discard the cached per-node
    // input/output information and rebuild it by walking every node.
    m_cgraph = cgraph;
    m_model_inputs.clear();
    m_model_outputs.clear();
    m_node_info_list.clear();
    for (int i = 0; i < cgraph->n_nodes; ++i) {
        set_input_output(cgraph->nodes[i]);
    }
}

GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights) {
m_cgraph = cgraph;
m_model_weights = model_weights;
Expand Down Expand Up @@ -330,6 +341,7 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
auto * mask = node->src[3];
std::string mask_name(mask->name);

model_params.kv_buffer_ctx_id = ggml_backend_openvino_buffer_get_ctx_id(cache_k->buffer);
if (mask_name.find("swa") != std::string::npos) {
model_params.swa_layers.push_back(layer);
model_params.ctx_per_seq_swa = cache_k->ne[1];
Expand Down Expand Up @@ -358,7 +370,7 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
break;
}
if (node->op == GGML_OP_ROPE) {
model_params.rope_params = node->op_params;
memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15);
}
}
auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1];
Expand Down Expand Up @@ -405,7 +417,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
// kvcache
input_shape = ov::PartialShape{get_shape(input)};
if (!m_is_static) {
// do not fix ctx size to make llama-bench work
// do not fix ctx size to make llama-bench work across test params
input_shape[2] = -1;
}

Expand Down
20 changes: 12 additions & 8 deletions ggml/src/ggml-openvino/ggml-decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "openvino/decoder.hpp"

#include <cstdint>
#include <cstring>
#include <map>
#include <memory>
#include <openvino/core/partial_shape.hpp>
Expand All @@ -20,20 +21,21 @@ struct ModelParams {
int n_heads = -1;
int n_heads_kv = -1;
int head_size = -1;
int32_t * rope_params = nullptr;
int32_t rope_params[15];
std::vector<int> swa_layers;

std::vector<std::string> kv_names;
size_t kv_buffer_ctx_id = 0;

bool operator==(const ModelParams & other) const {
return n_seq == other.n_seq && n_heads == other.n_heads && n_heads_kv == other.n_heads_kv &&
head_size == other.head_size && rope_params == other.rope_params && swa_layers == other.swa_layers &&
ctx_per_seq == other.ctx_per_seq && ctx_per_seq_swa == other.ctx_per_seq_swa;
bool same_rope_params(const ModelParams & other) const {
return memcmp(rope_params, other.rope_params, sizeof(int32_t) * 15) == 0;
}

bool can_reuse_dynamically(const ModelParams & other) const { return *this == other; }
bool can_reuse_dynamically(const ModelParams & other) const { return same_rope_params(other); }

bool can_reuse_statically(const ModelParams & other) const { return *this == other; }
bool can_reuse_statically(const ModelParams & other) const { return same_rope_params(other) && ctx == other.ctx; }

bool kv_buffer_changed(const ModelParams & other) const { return kv_buffer_ctx_id != other.kv_buffer_ctx_id; }
};

struct ComputeParams {
Expand Down Expand Up @@ -170,7 +172,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

int get_input_len() const { return m_compute_params.input_len; }

virtual int32_t * get_rope_params() const override { return m_model_params.rope_params; }
virtual int32_t * get_rope_params() const override { return const_cast<int32_t *>(m_model_params.rope_params); }

virtual std::map<std::string, std::string> get_kv_param_res_names() const override;

Expand Down Expand Up @@ -213,6 +215,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
static std::string compute_op_type(const ggml_tensor * node);
void add_extra_inputs();

void update_io(ggml_cgraph * cgraph);

inline static bool is_inp_tok(const ggml_tensor * tensor, const ggml_tensor * op) {
    // `tensor` is treated as the token-id input when it is the row-index
    // operand (src[1]) of a GET_ROWS op whose table operand (src[0]) is a
    // leaf tensor (GGML_OP_NONE).
    if (op->op != GGML_OP_GET_ROWS) {
        return false;
    }
    return tensor == op->src[1] && op->src[0]->op == GGML_OP_NONE;
}
Expand Down
16 changes: 16 additions & 0 deletions ggml/src/ggml-openvino/ggml-openvino.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "ggml-quants.hpp"
#include "ggml.h"

#include <atomic>
#include <cstdint>
#include <cstring>
#include <memory>
Expand Down Expand Up @@ -53,6 +54,7 @@
struct ggml_backend_openvino_buffer_context {
int device;
std::string name;
size_t id;

// For non-weight buffers (KV cache, compute), we still use contiguous allocation
void * data;
Expand All @@ -71,6 +73,10 @@ struct ggml_backend_openvino_buffer_context {
ggml_backend_openvino_buffer_context(int device, size_t size, bool is_remote = false) :
device(device),
name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)),
id([]() {
static std::atomic<size_t> next_id{1};
return next_id.fetch_add(1);
}()),
data(nullptr),
size(size),
is_remote(is_remote) {
Expand Down Expand Up @@ -107,6 +113,8 @@ struct ggml_backend_openvino_buffer_context {

~ggml_backend_openvino_buffer_context() {
// Clean up all tensor extras
GGML_LOG_DEBUG("Deleting OpenVINO buffer context #%zu for device %d, size %zu MB\n", id, device,
size / 1024 / 1024);
for (auto & pair : tensor_extras) {
delete pair.second;
}
Expand Down Expand Up @@ -587,6 +595,14 @@ bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) {
return buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer;
}

size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer) {
    // Return the unique id of the OpenVINO buffer context backing `buffer`.
    // Returns 0 when `buffer` is null or not an OpenVINO buffer; real ids
    // start at 1, so 0 is a safe "no context" sentinel.
    if (buffer == nullptr || !ggml_backend_buffer_is_openvino(buffer)) {
        return 0;
    }
    auto * ctx = static_cast<ggml_backend_openvino_buffer_context *>(buffer->context);
    return ctx->id;
}

bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name;
}
Expand Down
38 changes: 16 additions & 22 deletions ggml/src/ggml-openvino/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
ComputeParams c_params;
std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static);

const auto key = compute_graph_key(cgraph);
graph_key key(cgraph);
bool cache_hit;

int64_t decoder_end_time;
Expand All @@ -90,19 +90,22 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
auto it = decoder_cache.find(key);

cache_hit = it != decoder_cache.end();
ModelParams old_m_params;
if (cache_hit) {
ggml_decoder = it->second;
cache_hit = ggml_decoder->get_model_params().can_reuse_dynamically(m_params);
old_m_params = ggml_decoder->get_model_params();
cache_hit = old_m_params.can_reuse_dynamically(m_params);
}

if (cache_hit) {
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
ggml_decoder = decoder_cache[key];
ggml_decoder->set_compute_params(c_params);
ggml_decoder->set_model_params(m_params);
if (old_m_params.kv_buffer_changed(m_params)) {
ggml_decoder->update_io(cgraph);
}
ggml_decoder->add_extra_inputs();
infer_request = infer_request_cache[key];

infer_request = infer_request_cache.at(key);
if (stateful) {
const auto * inp_pos = get_inp_pos_tensor(cgraph);
int32_t * pos_data = (int32_t *) inp_pos->data;
Expand Down Expand Up @@ -240,7 +243,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {

const auto * inp_pos = get_inp_pos_tensor(cgraph);
const auto is_prefill = get_is_prefill(inp_pos);
const auto key = compute_graph_key(cgraph);
graph_key key(cgraph);
bool cache_hit;

int64_t decoder_end_time;
Expand All @@ -254,19 +257,23 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
auto it = decoder_cache.find(key);

cache_hit = it != decoder_cache.end();
ModelParams old_m_params;
if (cache_hit) {
ggml_decoder = it->second;
cache_hit = ggml_decoder->get_model_params().can_reuse_statically(m_params);
old_m_params = ggml_decoder->get_model_params();
cache_hit = old_m_params.can_reuse_statically(m_params);
}

if (cache_hit) {
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
ggml_decoder = decoder_cache[key];
ggml_decoder->m_is_prefill = is_prefill;
ggml_decoder->set_model_params(m_params);
ggml_decoder->set_compute_params(c_params);
if (old_m_params.kv_buffer_changed(m_params)) {
ggml_decoder->update_io(cgraph);
}
ggml_decoder->add_extra_inputs();
infer_request = is_prefill ? infer_request_cache_prefill[key] : infer_request_cache[key];
infer_request = is_prefill ? infer_request_cache_prefill.at(key) : infer_request_cache.at(key);

decoder_end_time = ggml_time_us();
conversion_end_time = decoder_end_time;
Expand Down Expand Up @@ -761,17 +768,4 @@ bool get_is_prefill(const ggml_tensor * inp_pos) {
return inp_pos->ne[0] > 1;
}

graph_key compute_graph_key(ggml_cgraph * cgraph) {
graph_key key;
key.n_nodes = cgraph->n_nodes;

for (int i = 0; i < cgraph->n_nodes; ++i) {
const auto * node = cgraph->nodes[i];
if (node->op == GGML_OP_SET_ROWS && strncmp(node->src[2]->name, "cache_k_l0", 10) == 0) {
key.cache_k_l0 = node->src[2];
}
}
return key;
}

#pragma GCC diagnostic pop
25 changes: 18 additions & 7 deletions ggml/src/ggml-openvino/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,33 @@
#include <algorithm>
#include <cstddef>
#include <openvino/runtime/core.hpp>
#include <string>

// Cache key identifying a compute graph: the node count plus the names of the
// first and last nodes. Used to look up previously compiled models / infer
// requests for an equivalent graph.
struct graph_key {
    int n_nodes = 0;
    std::string first_node_name;
    std::string last_node_name;

    // explicit: a raw cgraph pointer should never silently convert to a key.
    explicit graph_key(const ggml_cgraph * cgraph) : n_nodes(cgraph->n_nodes) {
        if (n_nodes > 0) {
            first_node_name = cgraph->nodes[0]->name;
            last_node_name = cgraph->nodes[n_nodes - 1]->name;
        }
    }

    bool operator==(const graph_key & other) const {
        return n_nodes == other.n_nodes && first_node_name == other.first_node_name &&
               last_node_name == other.last_node_name;
    }
};

// Hasher for graph_key so it can be used in unordered containers. Combines
// the node count with both boundary node names using the boost-style
// hash_combine mixing constant.
struct graph_key_hash {
    size_t operator()(const graph_key & key) const {
        auto combine = [](size_t seed, size_t value) {
            return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2));
        };
        size_t h = std::hash<int>{}(key.n_nodes);
        if (key.n_nodes > 0) {
            h = combine(h, std::hash<std::string>{}(key.first_node_name));
            h = combine(h, std::hash<std::string>{}(key.last_node_name));
        }
        return h;
    }
};
Expand Down Expand Up @@ -66,8 +79,6 @@ const ggml_tensor * get_inp_pos_tensor(struct ggml_cgraph * cgraph);

bool get_is_prefill(const ggml_tensor * inp_pos);

graph_key compute_graph_key(struct ggml_cgraph * cgraph);

ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
const std::string & param_name);
Expand Down