Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 20 additions & 15 deletions ggml/src/ggml-openvino/ggml-decoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -508,10 +508,10 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const

std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) {
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
static std::mutex weights_mutex;
// static std::mutex weights_mutex;
auto * nodes = cgraph->nodes;
auto n_nodes = cgraph->n_nodes;
std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor * node) {
std::for_each(std::execution::seq, nodes, nodes + n_nodes, [&](ggml_tensor * node) {
for (int i = 0; i < GGML_MAX_SRC; i++) {
auto * src = node->src[i];
if (src == nullptr) {
Expand All @@ -522,21 +522,26 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
if (!src->view_src) {
ggml_backend_buffer * buffer = src->buffer;
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type)) {
bool should_create = false;
{
std::lock_guard<std::mutex> lock(weights_mutex);
if (model_weights.find(src_name) == model_weights.end()) {
model_weights[src_name] = nullptr;
should_create = true;
}
}
if (should_create) {
// bool should_create = false;
// {
// std::lock_guard<std::mutex> lock(weights_mutex);
// if (model_weights.find(src_name) == model_weights.end()) {
// model_weights[src_name] = nullptr;
// should_create = true;
// }
// }
// if (should_create) {
// auto weight_node = create_weight_node(src);
// weight_node->set_friendly_name(src_name);
// {
// std::lock_guard<std::mutex> lock(weights_mutex);
// model_weights[src_name] = weight_node;
// }
// }
if (model_weights.find(src_name) == model_weights.end()) {
auto weight_node = create_weight_node(src);
weight_node->set_friendly_name(src_name);
{
std::lock_guard<std::mutex> lock(weights_mutex);
model_weights[src_name] = weight_node;
}
model_weights[src_name] = weight_node;
}
}
}
Expand Down
29 changes: 16 additions & 13 deletions ggml/src/ggml-openvino/ggml-openvino-extra.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,12 +209,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
layout.is_requant = true;
layout.requant_type = requant_type;

// Special case: requant to F16 - just store F16 weights, no scales/biases
// Special case: requant to F16 - just store F16 weights, no scales/zp
if (requant_type.value() == ExtraQuantType::F16) {
layout.weights_size = n_elements * sizeof(uint16_t); // F16 = 2 bytes
layout.total_size = layout.weights_size;
layout.weights_offset = 0;
// No scales/biases for F16
// No scales/zp for F16
return layout;
}

Expand Down Expand Up @@ -255,14 +255,15 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t);
// For symmetric quantization, we only need one bias value (not one per block)
layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t);
// For symmetric quantization, we only need one zp value (not one per block)
// Zero points are stored in U4 or U8 format matching the weight type
size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;

layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
layout.biases_offset =
layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.biases_offset + layout.biases_size;
layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.zp_offset + layout.zp_size;
layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
return layout;
}
Expand Down Expand Up @@ -305,17 +306,19 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
// Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;

// Scales and biases: F16 per block
// Scales: F16 per block
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
// For symmetric quantization, we only need one bias value (not one per block)
layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t);
// Zero points: U4 or U8 matching weight type
// For symmetric quantization, we only need one zp value (not one per block)
size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;

// Layout in buffer: [weights | scales | biases] with alignment
// Layout in buffer: [weights | scales | zp] with alignment
layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.biases_offset + layout.biases_size;
layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.zp_offset + layout.zp_size;

return layout;
}
Expand Down
21 changes: 12 additions & 9 deletions ggml/src/ggml-openvino/ggml-openvino-extra.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,16 +110,19 @@ struct ggml_openvino_weight_extra : public ggml_openvino_extra_base {
: ggml_openvino_extra_base(Type::WEIGHT), constant(std::move(c)) {}
};

// Extra data for quantized weight tensors - stores extracted weights/scales/zp and ov::Constant.
//
// Holds the three extracted tensors together with the pre-built weight node so
// that they can be attached to a ggml tensor as a single "extra" object. The
// base class is tagged with Type::QUANTIZED_WEIGHT.
struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base {
    ov::Tensor                weights;   // U4 or U8 extracted weights
    ov::Tensor                scales;    // F16 scales
    ov::Tensor                zp;        // U4 or U8 zero points (same type as weights)
    std::shared_ptr<ov::Node> constant;  // Pre-built OpenVINO weight subgraph

    // Takes every argument by value and moves it into the matching member:
    //   w -> weights, s -> scales, z -> zp, c -> constant.
    ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> c) :
        ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT),
        weights(std::move(w)),
        scales(std::move(s)),
        zp(std::move(z)),
        constant(std::move(c)) {}
};

// Extra data for KV cache / compute tensors - stores ov::Tensor for infer_request
Expand All @@ -133,7 +136,7 @@ struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base {
// =====================================================
// Extracted Size Calculation for Quantized Tensors
// =====================================================
// For quantized tensors, we need extra space to store extracted weights, scales, and biases.
// For quantized tensors, we need extra space to store extracted weights, scales, and zero points.
// Returns the total size needed in the buffer for extracted data.

struct ggml_openvino_extracted_layout {
Expand All @@ -142,10 +145,10 @@ struct ggml_openvino_extracted_layout {
size_t weights_size; // Size of weights in bytes
size_t scales_offset; // Offset to scales in buffer
size_t scales_size; // Size of scales in bytes
size_t biases_offset; // Offset to biases in buffer
size_t biases_size; // Size of biases in bytes
size_t zp_offset; // Offset to zero points in buffer
size_t zp_size; // Size of zero points in bytes (U4 or U8)
bool is_u4; // true for U4 weights, false for U8
int64_t weights_per_block;// weights per scale/bias block
int64_t weights_per_block; // weights per scale/zp block
bool is_symmetric; // true for symmetric quantization

// Requantization info
Expand Down
13 changes: 7 additions & 6 deletions ggml/src/ggml-openvino/ggml-openvino.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -259,13 +259,15 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
ov::Shape weight_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
ov::Shape scale_shape = {static_cast<size_t>(tensor->ne[1]),
static_cast<size_t>(tensor->ne[0] / layout.weights_per_block)};
// zp shape: scalar for symmetric, per-block for asymmetric
ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;

ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset);
ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
ov::Tensor zp(weight_type, zp_shape, buf_base + layout.zp_offset);

auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),
std::move(biases), constant);
std::move(zp), constant);
ctx->tensor_extras[tensor] = extra;
tensor->extra = extra;

Expand Down Expand Up @@ -487,10 +489,9 @@ static size_t ggml_backend_openvino_buffer_type_get_alloc_size(ggml_backend_buff
if (ggml_is_quantized(tensor->type) && tensor->ne[2] == 1 && tensor->ne[3] == 1) {
ggml_openvino_extracted_layout layout = ggml_openvino_get_extracted_layout(tensor);
if (layout.total_size > 0) {
GGML_LOG_DEBUG(
"%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu biases=%zu)\n",
__func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size, layout.scales_size,
layout.biases_size);
GGML_LOG_DEBUG("%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu zp=%zu)\n",
__func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size,
layout.scales_size, layout.zp_size);
return layout.total_size;
}
}
Expand Down
Loading