diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index f759e2d5883..b5d81e2e735 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -683,7 +683,9 @@ extern "C" {
         void * extra; // extra things e.g. for ggml-cuda.cu

-        char padding[8];
+        char padding[16];
+        // original source tensor (NULL by default), used to track the source tensor across scheduler-made copies and in-place operations
+        struct ggml_tensor * org_src;
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 22c656996cc..8323c2e4b65 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1269,6 +1269,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
                     ggml_set_input(tensor_copy);
                     ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                 }
+                tensor_copy->org_src = src;
                 tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
                 SET_CAUSE(tensor_copy, "4.cpy");
             }
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 8796c23abd3..9cdf8ad10f9 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -71,6 +71,13 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
         set_input_output(cur_node);
     }

+    m_is_full_model = has_inp_tokens && has_output;
+    if (!m_is_full_model) {
+        compute_cgraph_dynamic_dims();
+        add_extra_model_inputs_for_fallback();
+        add_extra_model_outputs_for_fallback();
+    }
+
     for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
         m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node);
         m_node_info_list[node_n].node_op_type = compute_op_type(m_node_info_list[node_n].node);
@@ -168,6 +175,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
         current_node_info.node_inputs[src_name] = src;
         current_node_info.node_inputs_names.push_back(src_name);

+        if (is_inp_tok(src, node)) {
+            has_inp_tokens = true;
+        }
+
         // Add model inputs
         if (!naive && !src->view_src) {
             ggml_backend_buffer * buffer = src->buffer;
@@ -209,6 +220,9 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
     if (!naive) {
         // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches
         static std::set<std::string> debug_output_names = {};
+        if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
+            has_output = true;
+        }
         // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph
         if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || debug_output_names.count(node_output_name)) {
@@ -297,6 +311,9 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
                 throw std::runtime_error("Unsupported VIEW case");
             }
             op_case = 2;
+            if (!m_is_full_model && m_model_inputs.find(std::string(src->name)) != m_model_inputs.end()) {
+                op_case = 0;
+            }
         }
         break;
     }
@@ -375,6 +392,11 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
         if (node->op == GGML_OP_ROPE) {
             memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15);
         }
     }
     auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1];
     compute_params.output_len = output_tensor->ne[1];
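+    // ne[1] of the final graph tensor is the number of token positions that produce logits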
+    // for NPU, output_len is always 1 except for llama-perplexity
+    if (is_static && compute_params.output_len == 0) {
+        compute_params.output_len = 1;
+    }
@@ -393,7 +419,7 @@ void GgmlOvDecoder::validate_cgraph() const {
     }
 }

-ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const {
+ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index) const {
     auto name = std::string(input->name);
     ov::PartialShape input_shape;
@@ -432,6 +458,10 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
     } else {
         input_shape = ov::PartialShape{get_shape(input)};
     }
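+    // ggml dims are ordered innermost-first (ne[0..3]) while the OV shape is reversed, so ggml dim i maps to OV dim 3 - i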
+    if (dynamic_dim_index != -1) {
+        input_shape[3 - dynamic_dim_index] = -1;
+    }
     return input_shape;
 }
@@ -614,6 +643,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
     if (!is_ov_buffer) {
         return ov_weight.weight_node;
     }
+    ggml_openvino_buffer_register_extra(tensor, extra);

     ggml_openvino_extra_base * extra;
     if (ov_weight.is_quantized()) {
@@ -914,3 +944,221 @@ const std::string & GgmlOvDecoder::get_op_type() const {
     static const std::string unknown_op = "UNKNOWN_GGML_OP";
     return unknown_op;
 }
+
+/**
+ * @brief Computes the dynamic dimension of each computation graph node to support fallback mechanisms.
+ *
+ * This function traverses the computation graph and determines the dynamic dimension
+ * of each node based on its operation type and dependencies. The dynamic dimension
+ * index is stored in the `m_node_dynamic_dims` map, where a value of -1 indicates no
+ * dynamic dimension. Specific operations such as GGML_OP_GET_ROWS, GGML_OP_MUL,
+ * GGML_OP_VIEW, etc., are handled to compute the dynamic dimension index.
+ *
+ * Key behaviors:
+ * - Nodes with operations like GGML_OP_NONE, GGML_OP_GET_ROWS, GGML_OP_MUL, and others
+ *   are analyzed to determine their dynamic dimensions.
+ * - Source tensors recognized as "inp_tokens", "inp_pos", or "inp_out_ids" are
+ *   explicitly assigned a dynamic dimension index of 0.
+ * - For operations like GGML_OP_VIEW and GGML_OP_RESHAPE, the function ensures that
+ *   the dynamic dimension is uniquely determined; otherwise, a warning is printed.
+ * - Unhandled operations print a message with the node name and operation type.
+ *
+ * This function is critical for preparing the computation graph for execution, ensuring
+ * that dynamic dimensions are correctly propagated and resolved.
+ */
+void GgmlOvDecoder::compute_cgraph_dynamic_dims() {
+    auto visit_node = [&](auto && self, ggml_tensor * node) -> void {
+        if (!node) {
+            return;
+        }
+
+        if (node->op == GGML_OP_CPY) {
+            m_node_dynamic_dims[node] = -1;
+        }
+
+        if (m_node_dynamic_dims.count(node)) {
+            return;
+        }
+
+        for (int i = 0; i < GGML_MAX_SRC; i++) {
+            ggml_tensor * src = node->src[i];
+            if (src == nullptr) {
+                continue;
+            }
+            if (src->org_src) {
+                if (is_inp_tok(src->org_src, node) || is_inp_pos(src->org_src, node) || is_output_idx(src->org_src, node)) {
+                    m_node_dynamic_dims[src->org_src] = 0;
+                    m_node_dynamic_dims[src] = m_node_dynamic_dims[src->org_src];
+                    continue;
+                }
+                self(self, src->org_src);
+                m_node_dynamic_dims[src] = m_node_dynamic_dims[src->org_src];
+            } else {
+                if (is_inp_tok(src, node) || is_inp_pos(src, node) || is_output_idx(src, node)) {
+                    m_node_dynamic_dims[src] = 0;
+                    continue;
+                }
+                self(self, src);
+            }
+        }
+
+        switch (node->op) {
+            case GGML_OP_NONE:
+                m_node_dynamic_dims[node] = -1;
+                // if (std::string(node->name) == "inp_tokens" || std::string(node->name) == "inp_pos" ||
+                //     std::string(node->name) == "inp_out_ids") {
+                //     m_node_dynamic_dims[node] = 0;
+                // }
+                break;
+            case GGML_OP_GET_ROWS:
+                m_node_dynamic_dims[node] = -1;
+                if (m_node_dynamic_dims[node->src[1]] != -1) {
+                    m_node_dynamic_dims[node] = 1;
+                }
+                break;
+            case GGML_OP_MUL:
+            case GGML_OP_MUL_MAT:
+                m_node_dynamic_dims[node] = -1;
+                if (m_node_dynamic_dims[node->src[0]] != -1) {
+                    m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
+                }
+                if (m_node_dynamic_dims[node->src[1]] != -1) {
+                    m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]];
+                }
+                break;
+            case GGML_OP_VIEW:
+            case GGML_OP_FLASH_ATTN_EXT:
+            case GGML_OP_PERMUTE:
+            case GGML_OP_RESHAPE:
+                m_node_dynamic_dims[node] = -1;
+                if (m_node_dynamic_dims[node->src[0]] != -1) {
+                    auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
+                    auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx];
+                    int same_dim_count = 0;
+                    for (int i = 0; i < 4; i++) {
+                        if (node->ne[i] == dynamic_dim_value) {
+                            m_node_dynamic_dims[node] = i;
+                            same_dim_count++;
+                        }
+                    }
+                    if (same_dim_count != 1) {
+                        std::cout << "Cannot determine dynamic dim for node: " << node->name << std::endl;
+                    }
+                }
+                break;
+            case GGML_OP_RMS_NORM:
+            case GGML_OP_ADD:
+            case GGML_OP_GLU:
+            case GGML_OP_ROPE:
+            case GGML_OP_SCALE:
+                m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
+                break;
+            case GGML_OP_CPY:
+            case GGML_OP_SET_ROWS:
+                m_node_dynamic_dims[node] = -1;
+                break;
+            default:
+                std::cout << "Unhandled node: " << node->name << " op: " << ggml_op_name(node->op) << std::endl;
+                break;
+        }
+    };

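+    // Memoized recursive walk: each tensor is resolved at most once via m_node_dynamic_dims, keeping the pass linear in graph size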
+    for (int i = 0; i < m_cgraph->n_nodes; i++) {
+        ggml_tensor * node = m_cgraph->nodes[i];
+        visit_node(visit_node, node);
+    }
+}
+
+/**
+ * @brief Adds extra model outputs to support fallback mechanisms.
+ *
+ * This function ensures that all relevant nodes in the computation graph are included
+ * as model outputs for fallback scenarios. It creates a mapping of tensor data addresses
+ * to their corresponding nodes, excluding nodes with the GGML_OP_VIEW operation.
+ *
+ * Key behaviors:
+ * - Returns early, adding nothing, if the graph already contains a node with the
+ *   GGML_TENSOR_FLAG_OUTPUT flag.
+ * - Iterates through all nodes in the computation graph and maps their data addresses
+ *   to the corresponding tensor nodes, skipping nodes with GGML_OP_VIEW.
+ * - Adds nodes to the `m_model_outputs` map if they are not already present, using
+ *   the tensor's name as the key.
+ *
+ * This function is essential for ensuring that fallback mechanisms have access to all
+ * necessary model outputs, particularly in scenarios where certain outputs are not
+ * explicitly defined in the original model configuration.
+ */
+void GgmlOvDecoder::add_extra_model_outputs_for_fallback() {
+    std::map<void *, ggml_tensor *> address_map;
+    for (int i = 0; i < m_cgraph->n_nodes; i++) {
+        ggml_tensor * node = m_cgraph->nodes[i];
+
+        if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
+            return;
+        }
+
+        if (node->op == GGML_OP_VIEW) {
+            continue;
+        }
+        address_map[node->data] = node;
+    }
+
+    for (const auto & pair : address_map) {
+        const std::string & name = pair.second->name;
+        if (m_model_outputs.find(name) == m_model_outputs.end()) {
+            m_model_outputs[name] = pair.second;
+        }
+    }
+}
+
+/**
+ * @brief Adds extra model inputs to support fallback mechanisms.
+ *
+ * This function ensures that all necessary input nodes in the computation graph are
+ * included as model inputs for fallback scenarios. It iterates through the source nodes
+ * of each computation graph node and adds them to the `m_model_inputs` map if they meet
+ * specific criteria.
+ *
+ * Key behaviors:
+ * - Skips source nodes that are already present in `m_model_weights` or `m_model_inputs`.
+ * - Excludes intermediate nodes that are part of `m_node_info_list`.
+ * - For eligible source nodes, creates OpenVINO parameter nodes with appropriate types
+ *   and shapes, and assigns them friendly names.
+ * - Updates the `m_inputs` and `m_model_inputs` maps with the new parameter nodes.
+ *
+ * This function is critical for ensuring that fallback mechanisms have access to all
+ * required model inputs, particularly in scenarios where certain inputs are not
+ * explicitly defined in the original model configuration.
+ */
+void GgmlOvDecoder::add_extra_model_inputs_for_fallback() {
+    for (int i = 0; i < m_cgraph->n_nodes; i++) {
+        ggml_tensor * node = m_cgraph->nodes[i];
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            auto * src = node->src[j];
+            if (src == nullptr) {
+                continue;
+            }
+            std::string src_name = std::string(src->name);
+            if (m_model_weights.find(src_name) != m_model_weights.end()) {
+                continue;
+            }
+
+            bool is_intermediate_node = false;
+            for (const auto & node_info : m_node_info_list) {
+                if (node_info.node == src) {
+                    is_intermediate_node = true;
+                    break;
+                }
+            }
+            if (is_intermediate_node) {
+                continue;
+            }
+            if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
+                continue;
+            }

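+            // Pass the dynamic dim recorded for this tensor so the Parameter is created with a dynamic dimension where needed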
+            m_inputs[src_name] = src;
+            auto param_node = std::make_shared<ov::op::v0::Parameter>(
+                get_ov_type(src), get_graph_input_shape(node, src, m_node_dynamic_dims[src]));
+            param_node->set_friendly_name(src_name);
+            param_node->output(0).get_tensor().set_names({src_name});
+            m_model_inputs[src_name] = param_node;
+        }
+    }
+}
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index 59311a61214..b4315f1a0db 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -180,7 +180,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     virtual bool is_stateful() const override { return m_is_stateful; }

-    ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const;
+    ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index = -1) const;

     static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);

@@ -204,9 +204,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     void set_compute_params(const ComputeParams & compute_params) { m_compute_params = compute_params; }

+    virtual bool is_full_model() const override { return m_is_full_model; }
+
     bool m_is_static = false;
     bool m_is_stateful = false;
     bool m_is_prefill = false;
+    bool m_is_full_model = true;  // whether the cgraph is a full model or a split (fallback) subgraph
     int m_prefill_chunk_size = 0;

     static ov::Shape get_shape(const ggml_tensor * tensor);
@@ -268,6 +271,13 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     void set_input_output(ggml_tensor * node, bool naive = false);
     int compute_op_case(const ggml_tensor * node) const;

+    // @brief Computes the dynamic dimension of each computation graph node to support fallback mechanisms.
+    void compute_cgraph_dynamic_dims();
+    // @brief Adds extra model outputs to support fallback mechanisms.
+    void add_extra_model_outputs_for_fallback();
+    // @brief Adds extra model inputs to support fallback mechanisms.
+    void add_extra_model_inputs_for_fallback();
+
     void validate_cgraph() const;

     ggml_cgraph * m_cgraph = nullptr;
@@ -280,6 +290,11 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     std::map<std::string, ggml_tensor *> m_model_outputs;
     std::vector<NodeInfo> m_node_info_list;

+    // map from ggml_tensor to its dynamic dimension index, -1 means static
+    std::map<ggml_tensor *, int> m_node_dynamic_dims;
+    bool has_inp_tokens = false;
+    bool has_output = false;
+
     ModelParams m_model_params;
     ComputeParams m_compute_params;
 };
diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp
index 3b8da2be5d2..1fe4ea6c811 100644
--- a/ggml/src/ggml-openvino/openvino/decoder.hpp
+++ b/ggml/src/ggml-openvino/openvino/decoder.hpp
@@ -53,6 +53,8 @@ class GgmlDecoder : public DecoderBase {
     virtual int get_op_case(int node_idx) const = 0;

+    virtual bool is_full_model() const = 0;
+
     virtual const std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>>& get_model_inputs() const = 0;
     virtual const std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>>& get_model_extra_inputs() const = 0;
     virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const = 0;
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 286229dc0e3..acd71f5844e 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -78,17 +78,16 @@ ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs(
 }

 void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
-    auto token_len_per_seq = tensor_map.at("token_len_per_seq").get_node_shared_ptr();
-
     auto create_sliced_mask = [&](const std::string & mask_name, const std::string & sliced_name, bool is_static) {
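+        // Fallback subgraphs may not contain "token_len_per_seq", so only build the sliced mask when both tensors are present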
-        if (tensor_map.find(mask_name) != tensor_map.end()) {
+        if ((tensor_map.find(mask_name) != tensor_map.end()) && (tensor_map.find("token_len_per_seq") != tensor_map.end())) {
+            auto token_len_per_seq = tensor_map.at("token_len_per_seq").get_node_shared_ptr();
             auto mask = tensor_map.at(mask_name).get_node_shared_ptr();
             std::shared_ptr<ov::Node> mask_sliced;
             if (is_static) {
                 mask_sliced = mask;
             } else if (ggml_model_decoder.is_stateful()) {
-                auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
-                auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
+                auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 0});
+                auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 1});
                 auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
                 auto three_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {3});
                 auto neg_one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
@@ -129,14 +128,15 @@ void add_rope_sin_cos(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder)
         rope_freqs_weight = tensor_map.at("rope_freqs.weight").get_node_shared_ptr();
     }

-    auto sin_cos = make_sin_cos(rope_params, inp_pos, rope_freqs_weight);
-    auto sin_theta = sin_cos.first;
-    auto cos_theta = sin_cos.second;
+        auto sin_cos = make_sin_cos(rope_params, inp_pos, rope_freqs_weight);
+        auto sin_theta = sin_cos.first;
+        auto cos_theta = sin_cos.second;

-    cos_theta.get_node_shared_ptr()->set_friendly_name("rope_cos");
-    sin_theta.get_node_shared_ptr()->set_friendly_name("rope_sin");
-    tensor_map.insert({"rope_cos", cos_theta});
-    tensor_map.insert({"rope_sin", sin_theta});
+        cos_theta.get_node_shared_ptr()->set_friendly_name("rope_cos");
+        sin_theta.get_node_shared_ptr()->set_friendly_name("rope_sin");
+        tensor_map.insert({"rope_cos", cos_theta});
+        tensor_map.insert({"rope_sin", sin_theta});
+    }
 }

 // Create common patterns
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index a370043dd77..2831cb05a0c 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -4,6 +4,7 @@
 #include "ggml-openvino-extra.h"
 #include "ggml-openvino/ggml-decoder.h"
 #include "ggml.h"
+#include "ggml-cpu.h"  // for ggml_graph_compute_with_ctx

 #include "openvino/frontend.hpp"
 #include "openvino/input_model.hpp"
@@ -95,7 +96,9 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
     if (cache_hit) {
         ggml_decoder = it->second;
         old_m_params = ggml_decoder->get_model_params();
-        cache_hit = old_m_params.can_reuse_dynamically(m_params);
+        if (ggml_decoder->is_full_model()) {
+            cache_hit = old_m_params.can_reuse_dynamically(m_params);
+        }
     }

     if (cache_hit) {
@@ -437,7 +440,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
 }

 bool is_naive(ggml_cgraph * cgraph) {
-    constexpr int naive_graph_size_threshold = 20;
+    constexpr int naive_graph_size_threshold = 0;
     int count = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
         if (cgraph->nodes[i]->op != GGML_OP_NONE) {
@@ -497,7 +500,7 @@ namespace {
 ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & name) {
     const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);

-    if (ggml_tensor->extra != nullptr) {
+    if (ggml_tensor->extra != nullptr && ggml_decoder->is_full_model()) {
         // GGML_LOG_DEBUG("Using ggml_tensor->extra as ov::Tensor for input: %s\n", name.c_str());
         auto * extra_base = static_cast<ggml_openvino_extra_base *>(ggml_tensor->extra);
         if (extra_base->type != ggml_openvino_extra_base::Type::TENSOR) {
@@ -510,12 +513,76 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
     // GGML_LOG_DEBUG("Converting ggml tensor to ov::Tensor for input: %s\n", name.c_str());
     auto * input_data = ggml_tensor->data;
     ov::Shape input_shape;
-    if (ggml_tensor->op == GGML_OP_VIEW) {
+    if (0) {
         // This case is added to make test-backend-ops work
         input_shape = ggml_decoder->get_shape(ggml_tensor->view_src);
     } else {
         input_shape = ggml_decoder->get_shape(ggml_tensor);
     }
+
+    // If the tensor is the result of a PERMUTE operation, use ggml_cont to make it contiguous
+    if (ggml_tensor->op == GGML_OP_PERMUTE && !ggml_decoder->is_full_model()) {
+        // Create a temporary context for the ggml_cont operation.
+        // Need space for: tensor overhead, tensor data, graph structure, and work buffer.
+        size_t mem_size = ggml_tensor_overhead() * 4 + ggml_nbytes(ggml_tensor) * 2 + 1024 * 1024;
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ mem_size,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ false,
+        };
+        struct ggml_context * temp_ctx = ggml_init(params);
+        if (temp_ctx == NULL) {
+            throw std::runtime_error("Failed to initialize temporary context for PERMUTE");
+        }
+
+        // Create a contiguous tensor using ggml_cont
+        struct ggml_tensor * cont_tensor = ggml_cont(temp_ctx, const_cast<ggml_tensor *>(ggml_tensor));
+
+        // Build a minimal graph to compute ggml_cont
+        struct ggml_cgraph * gf = ggml_new_graph(temp_ctx);
+        ggml_build_forward_expand(gf, cont_tensor);
+        ggml_graph_compute_with_ctx(temp_ctx, gf, 1);
+
+        // Create the OpenVINO tensor with contiguous data
+        ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
+        memcpy(input_tensor.data(), cont_tensor->data, ggml_nbytes(cont_tensor));
+
+        // Free the temporary context
+        ggml_free(temp_ctx);
+
+        return input_tensor;
+    }
+
+    // If the tensor is the result of a VIEW operation, reconstruct contiguous data from the view
+    if (ggml_tensor->op == GGML_OP_VIEW && !ggml_decoder->is_full_model()) {
+        // If the view has as many elements as its source, the data is already contiguous and can be used as-is
+        if (ggml_nelements(ggml_tensor) == ggml_nelements(ggml_tensor->view_src)) {
+            auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data);
+            return input_tensor;
+        }
+
+        // Create the OpenVINO input tensor; the data needs to be reconstructed based on the view tensor's shape and strides
+        // TODO: parallelize the copy and copy a whole innermost dim per iteration (perf improvement)
+        ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
+        const auto * src_tensor = ggml_tensor->view_src;
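+        // ne[] is ordered innermost-first and nb[] holds byte strides, so i0..i3 walk ne[3]..ne[0]; each memcpy moves nb[0] bytes (one innermost element)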
+        size_t des_index = 0;
+        for (size_t i0 = 0; i0 < static_cast<size_t>(ggml_tensor->ne[3]); i0++) {
+            for (size_t i1 = 0; i1 < static_cast<size_t>(ggml_tensor->ne[2]); i1++) {
+                for (size_t i2 = 0; i2 < static_cast<size_t>(ggml_tensor->ne[1]); i2++) {
+                    for (size_t i3 = 0; i3 < static_cast<size_t>(ggml_tensor->ne[0]); i3++) {
+                        size_t src_index = ggml_tensor->view_offs + i0 * ggml_tensor->nb[3] + i1 * ggml_tensor->nb[2] +
+                                           i2 * ggml_tensor->nb[1] + i3 * ggml_tensor->nb[0];
+
+                        memcpy(static_cast<char *>(input_tensor.data()) + des_index,
+                               static_cast<const char *>(src_tensor->data) + src_index, ggml_tensor->nb[0]);
+                        des_index += ggml_tensor->nb[0];
+                    }
+                }
+            }
+        }
+        return input_tensor;
+    }
+
     auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data);
     return input_tensor;
 }