Extract u4/u8 zero point directly instead of FP bias #41

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged

wine99 merged 1 commit into dev_backend_openvino from extract-zp-instead-of-bias

Feb 5, 2026

+297 −280

ggml/src/ggml-openvino/ggml-decoder.cpp

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -508,10 +508,10 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
  
    std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) {

        std::map<std::string, std::shared_ptr<ov::Node>> model_weights;

        static std::mutex weights_mutex;

        // static std::mutex weights_mutex;

        auto * nodes = cgraph->nodes;

        auto n_nodes = cgraph->n_nodes;

        std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor * node) {

        std::for_each(std::execution::seq, nodes, nodes + n_nodes, [&](ggml_tensor * node) {

            for (int i = 0; i < GGML_MAX_SRC; i++) {

                auto * src = node->src[i];

                if (src == nullptr) {

    @@ -522,21 +522,26 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
  
                if (!src->view_src) {

                    ggml_backend_buffer * buffer = src->buffer;

                    if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type)) {

                        bool should_create = false;

                        {

                            std::lock_guard<std::mutex> lock(weights_mutex);

                            if (model_weights.find(src_name) == model_weights.end()) {

                                model_weights[src_name] = nullptr;

                                should_create = true;

                            }

                        }

                        if (should_create) {

                        // bool should_create = false;

                        // {

                        //     std::lock_guard<std::mutex> lock(weights_mutex);

                        //     if (model_weights.find(src_name) == model_weights.end()) {

                        //         model_weights[src_name] = nullptr;

                        //         should_create = true;

                        //     }

                        // }

                        // if (should_create) {

                        //     auto weight_node = create_weight_node(src);

                        //     weight_node->set_friendly_name(src_name);

                        //     {

                        //         std::lock_guard<std::mutex> lock(weights_mutex);

                        //         model_weights[src_name] = weight_node;

                        //     }

                        // }

                        if (model_weights.find(src_name) == model_weights.end()) {

                            auto weight_node = create_weight_node(src);

                            weight_node->set_friendly_name(src_name);

                            {

                                std::lock_guard<std::mutex> lock(weights_mutex);

                                model_weights[src_name] = weight_node;

                            }

                            model_weights[src_name] = weight_node;

                        }

                    }

                }

ggml/src/ggml-openvino/ggml-openvino-extra.cpp

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -209,12 +209,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
  
            layout.is_requant = true;

            layout.requant_type = requant_type;

            // Special case: requant to F16 - just store F16 weights, no scales/biases

            // Special case: requant to F16 - just store F16 weights, no scales/zp

            if (requant_type.value() == ExtraQuantType::F16) {

                layout.weights_size = n_elements * sizeof(uint16_t);  // F16 = 2 bytes

                layout.total_size = layout.weights_size;

                layout.weights_offset = 0;

                // No scales/biases for F16

                // No scales/zp for F16

                return layout;

            }

    @@ -255,14 +255,15 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
  
                layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;

                int64_t n_blocks = n_elements / layout.weights_per_block;

                layout.scales_size = n_blocks * sizeof(uint16_t);

                // For symmetric quantization, we only need one bias value (not one per block)

                layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t);

                // For symmetric quantization, we only need one zp value (not one per block)

                // Zero points are stored in U4 or U8 format matching the weight type

                size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;

                layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;

                layout.weights_offset = 0;

                layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;

                layout.biases_offset =

                    layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;

                layout.total_size = layout.biases_offset + layout.biases_size;

                layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;

                layout.total_size = layout.zp_offset + layout.zp_size;

                layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));

                return layout;

            }

    @@ -305,17 +306,19 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
  
        // Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes

        layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;

        // Scales and biases: F16 per block

        // Scales: F16 per block

        int64_t n_blocks = n_elements / layout.weights_per_block;

        layout.scales_size = n_blocks * sizeof(uint16_t);  // F16 = 2 bytes

        // For symmetric quantization, we only need one bias value (not one per block)

        layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t);

        // Zero points: U4 or U8 matching weight type

        // For symmetric quantization, we only need one zp value (not one per block)

        size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;

        layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;

        // Layout in buffer: [weights | scales | biases] with alignment

        // Layout in buffer: [weights | scales | zp] with alignment

        layout.weights_offset = 0;

        layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;

        layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;

        layout.total_size = layout.biases_offset + layout.biases_size;

        layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;

        layout.total_size = layout.zp_offset + layout.zp_size;

        return layout;

    }

ggml/src/ggml-openvino/ggml-openvino-extra.h

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -110,16 +110,19 @@ struct ggml_openvino_weight_extra : public ggml_openvino_extra_base {
  
            : ggml_openvino_extra_base(Type::WEIGHT), constant(std::move(c)) {}

    };

    // Extra data for quantized weight tensors - stores extracted weights/scales/biases and ov::Constant

    // Extra data for quantized weight tensors - stores extracted weights/scales/zp and ov::Constant

    // Per-tensor extra data for a quantized weight: the extracted tensors plus the
    // OpenVINO node built from them. Tagged as Type::QUANTIZED_WEIGHT in the base class.
    // NOTE(review): this span comes from a rendered diff whose +/- markers are missing, so it
    // appears to show both sides of the change at once. The `biases` member and the first
    // constructor look like the removed lines; the `zp` member and the second constructor look
    // like their replacements. Confirm against the merged file before treating both as live.
    struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base {

        ov::Tensor weights;   // U4 or U8 extracted weights

        ov::Tensor scales;    // F16 scales

        ov::Tensor biases;    // F16 biases (zero points)

        ov::Tensor zp;        // U4 or U8 zero points (same type as weights)

        std::shared_ptr<ov::Node> constant;  // Pre-built OpenVINO weight subgraph

        // Constructor storing weights, scales, biases and the pre-built node (each argument is moved in).
        // NOTE(review): has the same parameter types as the constructor below, so the two
        // cannot coexist in one struct — presumably this is the pre-change version.
        ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor b, std::shared_ptr<ov::Node> c)

            : ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT),

              weights(std::move(w)), scales(std::move(s)), biases(std::move(b)), constant(std::move(c)) {}

        // Constructor storing weights, scales, zero points and the pre-built node (each argument is moved in).
        ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> c) :

            ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT),

            weights(std::move(w)),

            scales(std::move(s)),

            zp(std::move(z)),

            constant(std::move(c)) {}

    };

    // Extra data for KV cache / compute tensors - stores ov::Tensor for infer_request

    @@ -133,7 +136,7 @@ struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base {
  
    // =====================================================

    // Extracted Size Calculation for Quantized Tensors

    // =====================================================

    // For quantized tensors, we need extra space to store extracted weights, scales, and biases.

    // For quantized tensors, we need extra space to store extracted weights, scales, and zero points.

    // Returns the total size needed in the buffer for extracted data.

    struct ggml_openvino_extracted_layout {

    @@ -142,10 +145,10 @@ struct ggml_openvino_extracted_layout {
  
        size_t weights_size;      // Size of weights in bytes

        size_t scales_offset;     // Offset to scales in buffer

        size_t scales_size;       // Size of scales in bytes

        size_t biases_offset;     // Offset to biases in buffer

        size_t biases_size;       // Size of biases in bytes

        size_t zp_offset;         // Offset to zero points in buffer

        size_t zp_size;           // Size of zero points in bytes (U4 or U8)

        bool is_u4;               // true for U4 weights, false for U8

        int64_t weights_per_block;// weights per scale/bias block

        int64_t weights_per_block;  // weights per scale/zp block

        bool is_symmetric;        // true for symmetric quantization

        // Requantization info

ggml/src/ggml-openvino/ggml-openvino.cpp

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -259,13 +259,15 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
  
                    ov::Shape weight_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};

                    ov::Shape scale_shape = {static_cast<size_t>(tensor->ne[1]),

                                             static_cast<size_t>(tensor->ne[0] / layout.weights_per_block)};

                    // zp shape: scalar for symmetric, per-block for asymmetric

                    ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;

                    ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset);

                    ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset);

                    ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset);

                    ov::Tensor zp(weight_type, zp_shape, buf_base + layout.zp_offset);

                    auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),

                                                                            std::move(biases), constant);

                                                                            std::move(zp), constant);

                    ctx->tensor_extras[tensor] = extra;

                    tensor->extra = extra;

    @@ -487,10 +489,9 @@ static size_t ggml_backend_openvino_buffer_type_get_alloc_size(ggml_backend_buff
  
        if (ggml_is_quantized(tensor->type) && tensor->ne[2] == 1 && tensor->ne[3] == 1) {

            ggml_openvino_extracted_layout layout = ggml_openvino_get_extracted_layout(tensor);

            if (layout.total_size > 0) {

                GGML_LOG_DEBUG(

                    "%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu biases=%zu)\n",

                    __func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size, layout.scales_size,

                    layout.biases_size);

                GGML_LOG_DEBUG("%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu zp=%zu)\n",

                               __func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size,

                               layout.scales_size, layout.zp_size);

                return layout.total_size;

            }

        }

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Extract u4/u8 zero point directly instead of FP bias #41

Diff view

Diff view

There are no files selected for viewing

Uh oh!