Skip to content
11 changes: 7 additions & 4 deletions ggml/src/ggml-openvino/openvino/translate_session.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <openvino/op/broadcast.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/convert_like.hpp>
#include <openvino/op/cos.hpp>
#include <openvino/op/divide.hpp>
#include <openvino/op/gather.hpp>
Expand Down Expand Up @@ -89,12 +90,14 @@ void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
auto three_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {3});
auto neg_one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {-2,-1});
auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
auto shape_of_inp_pos = std::make_shared<ov::op::v3::ShapeOf>(inp_pos);
auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(shape_of_inp_pos, two_1d, zero_1d);
auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len_per_seq, gather_inp_pos}, 0);
auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(inp_pos, neg_one_1d, three_1d);
auto reshaped_inp_pos = std::make_shared<ov::op::v1::Reshape>(gather_inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), false);
auto inp_pos_incremented = std::make_shared<ov::op::v1::Add>(reshaped_inp_pos, ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {1}));
auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len_per_seq, std::make_shared<v1::ConvertLike>(inp_pos_incremented, token_len_per_seq)}, 0);
mask_sliced =
std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
Expand Down
18 changes: 17 additions & 1 deletion ggml/src/ggml-openvino/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
auto & core = ov_singleton_core();
const auto & config = ggml_openvino_get_compile_config();
static auto is_static = false;
static size_t stateful_kv_size = 0;

// if (is_naive(cgraph)) {
// return naive_compute(cgraph, core, device, config);
Expand Down Expand Up @@ -106,12 +107,27 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
}
ggml_decoder->add_extra_inputs();
infer_request = infer_request_cache.at(key);

if (stateful) {
const auto * inp_pos = get_inp_pos_tensor(cgraph);
int32_t * pos_data = (int32_t *) inp_pos->data;
auto pos_shape = ggml_decoder->get_shape(inp_pos);
if (pos_data[0] == 0) {
infer_request->reset_state();
}
stateful_kv_size = pos_shape[3];
} else if (stateful_kv_size == pos_data[0]) {
stateful_kv_size += pos_shape[3];
} else {
auto states = infer_request->query_state();
for (auto state : states) {
auto state_tensor = state.get_state();
ov::Coordinate begin = {0, 0, 0, 0};
ov::Coordinate end = {state_tensor.get_shape()[0], static_cast<uint32_t>(pos_data[0]), state_tensor.get_shape()[2], state_tensor.get_shape()[3]};
ov::Tensor new_state_tensor(state_tensor, begin, end);
state.set_state(new_state_tensor);
}
stateful_kv_size = pos_data[0] + 1;
}
}

decoder_end_time = ggml_time_us();
Expand Down