diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index f759e2d5883..b5d81e2e735 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -683,7 +683,9 @@ extern "C" {
         void * extra; // extra things e.g. for ggml-cuda.cu

-        char padding[8];
+        char padding[16];
+        // original source tensor (NULL by default), used to track the source tensor across scheduler-made copies and in-place operations
+        struct ggml_tensor * org_src;
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 22c656996cc..8323c2e4b65 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1269,6 +1269,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
                     ggml_set_input(tensor_copy);
                     ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                 }
+                tensor_copy->org_src = src;
                 tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
                 SET_CAUSE(tensor_copy, "4.cpy");
             }
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 8796c23abd3..9cdf8ad10f9 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -71,6 +71,13 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
         set_input_output(cur_node);
     }

+    m_is_full_model = has_inp_tokens && has_output;
+    if (!m_is_full_model) {
+        compute_cgraph_dynamic_dims();
+        add_extra_model_inputs_for_fallback();
+        add_extra_model_outputs_for_fallback();
+    }
+
     for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
         m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node);
         m_node_info_list[node_n].node_op_type = compute_op_type(m_node_info_list[node_n].node);
@@ -168,6 +175,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
         current_node_info.node_inputs[src_name] = src;
         current_node_info.node_inputs_names.push_back(src_name);

+        if (is_inp_tok(src, node)) {
+            has_inp_tokens = true;
+        }
+
         // Add model inputs
         if (!naive && !src->view_src) {
             ggml_backend_buffer * buffer = src->buffer;
@@ -209,6 +220,9 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
     if (!naive) {
         // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches
         static std::set<std::string> debug_output_names = {};
+        if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
+            has_output = true;
+        }
         // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph
         if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || debug_output_names.count(node_output_name)) {
@@ -297,6 +311,9 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
                 throw std::runtime_error("Unsupported VIEW case");
             }
             op_case = 2;
+            if (!m_is_full_model && m_model_inputs.find(std::string(src->name)) != m_model_inputs.end()) {
+                op_case = 0;
+            }
         }
         break;
     }
@@ -375,6 +392,11 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
         if (node->op == GGML_OP_ROPE) {
             memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15);
         }
     }
     auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1];
     compute_params.output_len = output_tensor->ne[1];
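+    // ne[1] of the final graph tensor is the number of token positions that produce logits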
+    // for NPU, output_len is always 1 except for llama-perplexity
+    if (is_static && compute_params.output_len == 0) {
+        compute_params.output_len = 1;
+    }
@@ -393,7 +419,7 @@ void GgmlOvDecoder::validate_cgraph() const {
     }
 }

-ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const {
+ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index) const {
     auto name = std::string(input->name);
     ov::PartialShape input_shape;
@@ -432,6 +458,10 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
     } else {
         input_shape = ov::PartialShape{get_shape(input)};
     }
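+    // ggml dims are ordered innermost-first (ne[0..3]) while the OV shape is reversed, so ggml dim i maps to OV dim 3 - i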
+    if (dynamic_dim_index != -1) {
+        input_shape[3 - dynamic_dim_index] = -1;
+    }
     return input_shape;
 }
@@ -614,6 +643,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
     if (!is_ov_buffer) {
         return ov_weight.weight_node;
     }
+    ggml_openvino_buffer_register_extra(tensor, extra);

     ggml_openvino_extra_base * extra;
     if (ov_weight.is_quantized()) {
@@ -914,3 +944,221 @@ const std::string & GgmlOvDecoder::get_op_type() const {
     static const std::string unknown_op = "UNKNOWN_GGML_OP";
     return unknown_op;
 }
+
+/**
+ * @brief Computes the dynamic dimension of each computation graph node to support fallback mechanisms.
+ *
+ * This function traverses the computation graph and determines the dynamic dimension
+ * of each node based on its operation type and dependencies. The dynamic dimension
+ * index is stored in the `m_node_dynamic_dims` map, where a value of -1 indicates no
+ * dynamic dimension. Specific operations such as GGML_OP_GET_ROWS, GGML_OP_MUL,
+ * GGML_OP_VIEW, etc., are handled to compute the dynamic dimension index.
+ *
+ * Key behaviors:
+ * - Nodes with operations like GGML_OP_NONE, GGML_OP_GET_ROWS, GGML_OP_MUL, and others
+ *   are analyzed to determine their dynamic dimensions.
+ * - Source tensors recognized as "inp_tokens", "inp_pos", or "inp_out_ids" are
+ *   explicitly assigned a dynamic dimension index of 0.
+ * - For operations like GGML_OP_VIEW and GGML_OP_RESHAPE, the function ensures that
+ *   the dynamic dimension is uniquely determined; otherwise, a warning is printed.
+ * - Unhandled operations print a message with the node name and operation type.
+ *
+ * This function is critical for preparing the computation graph for execution, ensuring
+ * that dynamic dimensions are correctly propagated and resolved.
+ */
+void GgmlOvDecoder::compute_cgraph_dynamic_dims() {
+    auto visit_node = [&](auto && self, ggml_tensor * node) -> void {
+        if (!node) {
+            return;
+        }
+
+        if (node->op == GGML_OP_CPY) {
+            m_node_dynamic_dims[node] = -1;
+        }
+
+        if (m_node_dynamic_dims.count(node)) {
+            return;
+        }
+
+        for (int i = 0; i < GGML_MAX_SRC; i++) {
+            ggml_tensor * src = node->src[i];
+            if (src == nullptr) {
+                continue;
+            }
+            if (src->org_src) {
+                if (is_inp_tok(src->org_src, node) || is_inp_pos(src->org_src, node) || is_output_idx(src->org_src, node)) {
+                    m_node_dynamic_dims[src->org_src] = 0;
+                    m_node_dynamic_dims[src] = m_node_dynamic_dims[src->org_src];
+                    continue;
+                }
+                self(self, src->org_src);
+                m_node_dynamic_dims[src] = m_node_dynamic_dims[src->org_src];
+            } else {
+                if (is_inp_tok(src, node) || is_inp_pos(src, node) || is_output_idx(src, node)) {
+                    m_node_dynamic_dims[src] = 0;
+                    continue;
+                }
+                self(self, src);
+            }
+        }
+
+        switch (node->op) {
+            case GGML_OP_NONE:
+                m_node_dynamic_dims[node] = -1;
+                // if (std::string(node->name) == "inp_tokens" || std::string(node->name) == "inp_pos" ||
+                //     std::string(node->name) == "inp_out_ids") {
+                //     m_node_dynamic_dims[node] = 0;
+                // }
+                break;
+            case GGML_OP_GET_ROWS:
+                m_node_dynamic_dims[node] = -1;
+                if (m_node_dynamic_dims[node->src[1]] != -1) {
+                    m_node_dynamic_dims[node] = 1;
+                }
+                break;
+            case GGML_OP_MUL:
+            case GGML_OP_MUL_MAT:
+                m_node_dynamic_dims[node] = -1;
+                if (m_node_dynamic_dims[node->src[0]] != -1) {
+                    m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
+                }
+                if (m_node_dynamic_dims[node->src[1]] != -1) {
+                    m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]];
+                }
+                break;
+            case GGML_OP_VIEW:
+            case GGML_OP_FLASH_ATTN_EXT:
+            case GGML_OP_PERMUTE:
+            case GGML_OP_RESHAPE:
+                m_node_dynamic_dims[node] = -1;
+                if (m_node_dynamic_dims[node->src[0]] != -1) {
+                    auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
+                    auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx];
+                    int same_dim_count = 0;
+                    for (int i = 0; i < 4; i++) {
+                        if (node->ne[i] == dynamic_dim_value) {
+                            m_node_dynamic_dims[node] = i;
+                            same_dim_count++;
+                        }
+                    }
+                    if (same_dim_count != 1) {
+                        std::cout << "Cannot determine dynamic dim for node: " << node->name << std::endl;
+                    }
+                }
+                break;
+            case GGML_OP_RMS_NORM:
+            case GGML_OP_ADD:
+            case GGML_OP_GLU:
+            case GGML_OP_ROPE:
+            case GGML_OP_SCALE:
+                m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
+                break;
+            case GGML_OP_CPY:
+            case GGML_OP_SET_ROWS:
+                m_node_dynamic_dims[node] = -1;
+                break;
+            default:
+                std::cout << "Unhandled node: " << node->name << " op: " << ggml_op_name(node->op) << std::endl;
+                break;
+        }
+    };

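+    // Memoized recursive walk: each tensor is resolved at most once via m_node_dynamic_dims, keeping the pass linear in graph size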
+    for (int i = 0; i < m_cgraph->n_nodes; i++) {
+        ggml_tensor * node = m_cgraph->nodes[i];
+        visit_node(visit_node, node);
+    }
+}
+
+/**
+ * @brief Adds extra model outputs to support fallback mechanisms.
+ *
+ * This function ensures that all relevant nodes in the computation graph are included
+ * as model outputs for fallback scenarios. It creates a mapping of tensor data addresses
+ * to their corresponding nodes, excluding nodes with the GGML_OP_VIEW operation.
+ *
+ * Key behaviors:
+ * - Returns early, adding nothing, if the graph already contains a node with the
+ *   GGML_TENSOR_FLAG_OUTPUT flag.
+ * - Iterates through all nodes in the computation graph and maps their data addresses
+ *   to the corresponding tensor nodes, skipping nodes with GGML_OP_VIEW.
+ * - Adds nodes to the `m_model_outputs` map if they are not already present, using
+ *   the tensor's name as the key.
+ *
+ * This function is essential for ensuring that fallback mechanisms have access to all
+ * necessary model outputs, particularly in scenarios where certain outputs are not
+ * explicitly defined in the original model configuration.
+ */
+void GgmlOvDecoder::add_extra_model_outputs_for_fallback() {
+    std::map<void *, ggml_tensor *> address_map;
+    for (int i = 0; i < m_cgraph->n_nodes; i++) {
+        ggml_tensor * node = m_cgraph->nodes[i];
+
+        if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
+            return;
+        }
+
+        if (node->op == GGML_OP_VIEW) {
+            continue;
+        }
+        address_map[node->data] = node;
+    }
+
+    for (const auto & pair : address_map) {
+        const std::string & name = pair.second->name;
+        if (m_model_outputs.find(name) == m_model_outputs.end()) {
+            m_model_outputs[name] = pair.second;
+        }
+    }
+}
+
+/**
+ * @brief Adds extra model inputs to support fallback mechanisms.
+ *
+ * This function ensures that all necessary input nodes in the computation graph are
+ * included as model inputs for fallback scenarios. It iterates through the source nodes
+ * of each computation graph node and adds them to the `m_model_inputs` map if they meet
+ * specific criteria.
+ *
+ * Key behaviors:
+ * - Skips source nodes that are already present in `m_model_weights` or `m_model_inputs`.
+ * - Excludes intermediate nodes that are part of `m_node_info_list`.
+ * - For eligible source nodes, creates OpenVINO parameter nodes with appropriate types
+ *   and shapes, and assigns them friendly names.
+ * - Updates the `m_inputs` and `m_model_inputs` maps with the new parameter nodes.
+ *
+ * This function is critical for ensuring that fallback mechanisms have access to all
+ * required model inputs, particularly in scenarios where certain inputs are not
+ * explicitly defined in the original model configuration.
+ */
+void GgmlOvDecoder::add_extra_model_inputs_for_fallback() {
+    for (int i = 0; i < m_cgraph->n_nodes; i++) {
+        ggml_tensor * node = m_cgraph->nodes[i];
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            auto * src = node->src[j];
+            if (src == nullptr) {
+                continue;
+            }
+            std::string src_name = std::string(src->name);
+            if (m_model_weights.find(src_name) != m_model_weights.end()) {
+                continue;
+            }
+
+            bool is_intermediate_node = false;
+            for (const auto & node_info : m_node_info_list) {
+                if (node_info.node == src) {
+                    is_intermediate_node = true;
+                    break;
+                }
+            }
+            if (is_intermediate_node) {
+                continue;
+            }
+            if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
+                continue;
+            }

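+            // Pass the dynamic dim recorded for this tensor so the Parameter is created with a dynamic dimension where needed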
+            m_inputs[src_name] = src;
+            auto param_node = std::make_shared<ov::op::v0::Parameter>(
+                get_ov_type(src), get_graph_input_shape(node, src, m_node_dynamic_dims[src]));
+            param_node->set_friendly_name(src_name);
+            param_node->output(0).get_tensor().set_names({src_name});
+            m_model_inputs[src_name] = param_node;
+        }
+    }
+}
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index 59311a61214..b4315f1a0db 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -180,7 +180,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     virtual bool is_stateful() const override { return m_is_stateful; }

-    ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const;
+    ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index = -1) const;

     static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);

@@ -204,9 +204,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     void set_compute_params(const ComputeParams & compute_params) { m_compute_params = compute_params; }

+    virtual bool is_full_model() const override { return m_is_full_model; }
+
     bool m_is_static = false;
     bool m_is_stateful = false;
     bool m_is_prefill = false;
+    bool m_is_full_model = true;  // whether the cgraph is a full model or a split (fallback) subgraph
     int m_prefill_chunk_size = 0;

     static ov::Shape get_shape(const ggml_tensor * tensor);
@@ -268,6 +271,13 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     void set_input_output(ggml_tensor * node, bool naive = false);
     int compute_op_case(const ggml_tensor * node) const;

+    // @brief Computes the dynamic dimension of each computation graph node to support fallback mechanisms.
+    void compute_cgraph_dynamic_dims();
+    // @brief Adds extra model outputs to support fallback mechanisms.
+    void add_extra_model_outputs_for_fallback();
+    // @brief Adds extra model inputs to support fallback mechanisms.
+    void add_extra_model_inputs_for_fallback();
+
     void validate_cgraph() const;

     ggml_cgraph * m_cgraph = nullptr;
@@ -280,6 +290,11 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     std::map<std::string, ggml_tensor *> m_model_outputs;
     std::vector<NodeInfo> m_node_info_list;

+    // map from ggml_tensor to its dynamic dimension index, -1 means static
+    std::map<ggml_tensor *, int> m_node_dynamic_dims;
+    bool has_inp_tokens = false;
+    bool has_output = false;
+
     ModelParams m_model_params;
     ComputeParams m_compute_params;
 };
diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp
index 3b8da2be5d2..1fe4ea6c811 100644
--- a/ggml/src/ggml-openvino/openvino/decoder.hpp
+++ b/ggml/src/ggml-openvino/openvino/decoder.hpp
@@ -53,6 +53,8 @@ class GgmlDecoder : public DecoderBase {
     virtual int get_op_case(int node_idx) const = 0;

+    virtual bool is_full_model() const = 0;
+
     virtual const std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>>& get_model_inputs() const = 0;
     virtual const std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>>& get_model_extra_inputs() const = 0;
     virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const = 0;
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 286229dc0e3..acd71f5844e 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -78,17 +78,16 @@ ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs(
 }

 void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
-    auto token_len_per_seq = tensor_map.at("token_len_per_seq").get_node_shared_ptr();
-
     auto create_sliced_mask = [&](const std::string & mask_name, const std::string & sliced_name, bool is_static) {
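+        // Fallback subgraphs may not contain "token_len_per_seq", so only build the sliced mask when both tensors are present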
-        if (tensor_map.find(mask_name) != tensor_map.end()) {
+        if ((tensor_map.find(mask_name) != tensor_map.end()) && (tensor_map.find("token_len_per_seq") != tensor_map.end())) {
+            auto token_len_per_seq = tensor_map.at("token_len_per_seq").get_node_shared_ptr();
             auto mask = tensor_map.at(mask_name).get_node_shared_ptr();
             std::shared_ptr<ov::Node> mask_sliced;
             if (is_static) {
                 mask_sliced = mask;
             } else if (ggml_model_decoder.is_stateful()) {
-                auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
-                auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
+                auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 0});
+                auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 1});
                 auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
                 auto three_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {3});
                 auto neg_one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
@@ -129,14 +128,15 @@ void add_rope_sin_cos(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder)
         rope_freqs_weight = tensor_map.at("rope_freqs.weight").get_node_shared_ptr();
     }

-    auto sin_cos = make_sin_cos(rope_params, inp_pos, rope_freqs_weight);
-    auto sin_theta = sin_cos.first;
-    auto cos_theta = sin_cos.second;
+        auto sin_cos = make_sin_cos(rope_params, inp_pos, rope_freqs_weight);
+        auto sin_theta = sin_cos.first;
+        auto cos_theta = sin_cos.second;

-    cos_theta.get_node_shared_ptr()->set_friendly_name("rope_cos");
-    sin_theta.get_node_shared_ptr()->set_friendly_name("rope_sin");
-    tensor_map.insert({"rope_cos", cos_theta});
-    tensor_map.insert({"rope_sin", sin_theta});
+        cos_theta.get_node_shared_ptr()->set_friendly_name("rope_cos");
+        sin_theta.get_node_shared_ptr()->set_friendly_name("rope_sin");
+        tensor_map.insert({"rope_cos", cos_theta});
+        tensor_map.insert({"rope_sin", sin_theta});
+    }
 }

 // Create common patterns
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index a370043dd77..2831cb05a0c 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -4,6 +4,7 @@
 #include "ggml-openvino-extra.h"
 #include "ggml-openvino/ggml-decoder.h"
 #include "ggml.h"
+#include "ggml-cpu.h"  // for ggml_graph_compute_with_ctx

 #include "openvino/frontend.hpp"
 #include "openvino/input_model.hpp"
@@ -95,7 +96,9 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
     if (cache_hit) {
         ggml_decoder = it->second;
         old_m_params = ggml_decoder->get_model_params();
-        cache_hit = old_m_params.can_reuse_dynamically(m_params);
+        if (ggml_decoder->is_full_model()) {
+            cache_hit = old_m_params.can_reuse_dynamically(m_params);
+        }
     }

     if (cache_hit) {
@@ -437,7 +440,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
 }

 bool is_naive(ggml_cgraph * cgraph) {
-    constexpr int naive_graph_size_threshold = 20;
+    constexpr int naive_graph_size_threshold = 0;
     int count = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
         if (cgraph->nodes[i]->op != GGML_OP_NONE) {
@@ -497,7 +500,7 @@ namespace {
 ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & name) {
     const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);

-    if (ggml_tensor->extra != nullptr) {
+    if (ggml_tensor->extra != nullptr && ggml_decoder->is_full_model()) {
         // GGML_LOG_DEBUG("Using ggml_tensor->extra as ov::Tensor for input: %s\n", name.c_str());
         auto * extra_base = static_cast<ggml_openvino_extra_base *>(ggml_tensor->extra);
         if (extra_base->type != ggml_openvino_extra_base::Type::TENSOR) {
@@ -510,12 +513,76 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
     // GGML_LOG_DEBUG("Converting ggml tensor to ov::Tensor for input: %s\n", name.c_str());
     auto * input_data = ggml_tensor->data;
     ov::Shape input_shape;
-    if (ggml_tensor->op == GGML_OP_VIEW) {
+    if (0) {
         // This case is added to make test-backend-ops work
         input_shape = ggml_decoder->get_shape(ggml_tensor->view_src);
     } else {
         input_shape = ggml_decoder->get_shape(ggml_tensor);
     }
+
+    // If the tensor is the result of a PERMUTE operation, use ggml_cont to make it contiguous
+    if (ggml_tensor->op == GGML_OP_PERMUTE && !ggml_decoder->is_full_model()) {
+        // Create a temporary context for the ggml_cont operation.
+        // Need space for: tensor overhead, tensor data, graph structure, and work buffer.
+        size_t mem_size = ggml_tensor_overhead() * 4 + ggml_nbytes(ggml_tensor) * 2 + 1024 * 1024;
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ mem_size,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ false,
+        };
+        struct ggml_context * temp_ctx = ggml_init(params);
+        if (temp_ctx == NULL) {
+            throw std::runtime_error("Failed to initialize temporary context for PERMUTE");
+        }
+
+        // Create a contiguous tensor using ggml_cont
+        struct ggml_tensor * cont_tensor = ggml_cont(temp_ctx, const_cast<ggml_tensor *>(ggml_tensor));
+
+        // Build a minimal graph to compute ggml_cont
+        struct ggml_cgraph * gf = ggml_new_graph(temp_ctx);
+        ggml_build_forward_expand(gf, cont_tensor);
+        ggml_graph_compute_with_ctx(temp_ctx, gf, 1);
+
+        // Create the OpenVINO tensor with contiguous data
+        ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
+        memcpy(input_tensor.data(), cont_tensor->data, ggml_nbytes(cont_tensor));
+
+        // Free the temporary context
+        ggml_free(temp_ctx);
+
+        return input_tensor;
+    }
+
+    // If the tensor is the result of a VIEW operation, reconstruct contiguous data from the view
+    if (ggml_tensor->op == GGML_OP_VIEW && !ggml_decoder->is_full_model()) {
+        // If the view has as many elements as its source, the data is already contiguous and can be used as-is
+        if (ggml_nelements(ggml_tensor) == ggml_nelements(ggml_tensor->view_src)) {
+            auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data);
+            return input_tensor;
+        }
+
+        // Create the OpenVINO input tensor; the data needs to be reconstructed based on the view tensor's shape and strides
+        // TODO: parallelize the copy and copy a whole innermost dim per iteration (perf improvement)
+        ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
+        const auto * src_tensor = ggml_tensor->view_src;
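+        // ne[] is ordered innermost-first and nb[] holds byte strides, so i0..i3 walk ne[3]..ne[0]; each memcpy moves nb[0] bytes (one innermost element)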
+        size_t des_index = 0;
+        for (size_t i0 = 0; i0 < static_cast<size_t>(ggml_tensor->ne[3]); i0++) {
+            for (size_t i1 = 0; i1 < static_cast<size_t>(ggml_tensor->ne[2]); i1++) {
+                for (size_t i2 = 0; i2 < static_cast<size_t>(ggml_tensor->ne[1]); i2++) {
+                    for (size_t i3 = 0; i3 < static_cast<size_t>(ggml_tensor->ne[0]); i3++) {
+                        size_t src_index = ggml_tensor->view_offs + i0 * ggml_tensor->nb[3] + i1 * ggml_tensor->nb[2] +
+                                           i2 * ggml_tensor->nb[1] + i3 * ggml_tensor->nb[0];
+
+                        memcpy(static_cast<char *>(input_tensor.data()) + des_index,
+                               static_cast<const char *>(src_tensor->data) + src_index, ggml_tensor->nb[0]);
+                        des_index += ggml_tensor->nb[0];
+                    }
+                }
+            }
+        }
+        return input_tensor;
+    }
+
     auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data);
     return input_tensor;
 }