From 40d2bb23a14d4d2a2b06a29ee4fa02f45f23b23d Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Thu, 26 Feb 2026 15:55:24 -0800 Subject: [PATCH 1/3] Removed static variables from utils.cpp --- ggml/src/ggml-openvino/ggml-openvino-extra.h | 28 ++++ ggml/src/ggml-openvino/ggml-openvino.cpp | 32 +------ ggml/src/ggml-openvino/utils.cpp | 99 ++++++++++---------- ggml/src/ggml-openvino/utils.h | 24 ++++- 4 files changed, 98 insertions(+), 85 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index 441a62e9d30..292c57212be 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -167,3 +167,31 @@ ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor // Register an extra with the tensor's OpenVINO buffer context for proper lifetime management. // This sets tensor->extra and tracks the extra in the buffer context for cleanup. void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra); + +// ===================================================== +// OpenVINO Backend Context and Interface +// ===================================================== +struct ggml_backend_openvino_context { + int device; // the device ID currently in use + std::string name; // context Name + std::string description; // context description + + // OpenVINO runtime context + std::shared_ptr<void> ov_runtime_context; + + // OpenVINO Multi-stream support + static const int MAX_STREAMS = 8; // define the maximum number of flows + std::vector<ov::InferRequest> streams; // used to support multi-stream reasoning + int current_stream; // the currently active stream index + + // state Management + bool is_initialized; // initialize + + ggml_backend_openvino_context() : + device(0), + name("OpenVINO"), + description("OpenVINO Backend Context"), + current_stream(0), + ov_runtime_context(nullptr), + is_initialized(false) {} +}; diff --git 
a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 948ff2cc780..d117be6024d 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -593,36 +593,6 @@ bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft) { return buft->iface.get_name == ggml_backend_openvino_host_buffer_type_get_name; } -// ===================================================== -// OpenVINO Backend Context and Interface -// ===================================================== - -struct ggml_backend_openvino_context { - int device; // the device ID currently in use - std::string name; // context Name - std::string description; // context description - - // OpenVINO core components - ov::Core core; // OpenVINO core interface - std::shared_ptr<ov::Model> model; // compiled Model - ov::InferRequest infer_request; // inference Request - - // OpenVINO Multi-stream support - static const int MAX_STREAMS = 8; // define the maximum number of flows - std::vector<ov::InferRequest> streams; // used to support multi-stream reasoning - int current_stream; // the currently active stream index - - // state Management - bool is_initialized; // initialize - - ggml_backend_openvino_context() : - device(0), - name("OpenVINO"), - description("OpenVINO Backend Context"), - current_stream(0), - is_initialized(false) {} -}; - static void ggml_backend_openvino_free(ggml_backend_t backend) { ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context; delete ctx; @@ -635,7 +605,7 @@ static const char * ggml_backend_openvino_get_name(ggml_backend_t backend) { } static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - return ov_graph_compute(cgraph); + return ov_graph_compute(cgraph, backend); GGML_UNUSED(backend); } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index a3b7887ae58..07afd34d7ca 100644 --- 
a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -37,22 +37,27 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" -enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) { +enum ggml_status ov_graph_compute(ggml_cgraph * cgraph, ggml_backend_t backend) { + ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context; try { if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { std::string filename = "cgraph_ov.txt"; GgmlOvDecoder::dump_cgraph(cgraph, filename); } - // Use device from singleton (initialized during backend init) - const auto & device = ggml_openvino_get_device_name(); const auto is_static = ggml_openvino_is_npu(); - bool stateful = false; + + if (ctx->ov_runtime_context == nullptr) { + ctx->ov_runtime_context = std::make_shared<ov_runtime_context>(); + } + std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->ov_runtime_context); + r_ctx->device = ggml_openvino_get_device_name(); + r_ctx->stateful = false; if (getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !is_static) { - stateful = true; + r_ctx->stateful = true; } - return is_static ? ov_graph_compute_static(cgraph) : ov_graph_compute_dynamic(cgraph, device, stateful); + return is_static ? 
ov_graph_compute_static(cgraph, r_ctx) : ov_graph_compute_dynamic(cgraph, r_ctx); } catch (const ov::Exception & e) { GGML_LOG_ERROR("GGML OpenVINO backend ov::Exception: %s\n", e.what()); return GGML_STATUS_FAILED; @@ -65,11 +70,12 @@ enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) { } } -enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::string & device, bool stateful) { +enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx) { auto & core = ov_singleton_core(); const auto & config = ggml_openvino_get_compile_config(); + auto device = r_ctx->device; + bool stateful = r_ctx->stateful; static auto is_static = false; - static size_t stateful_kv_size = 0; if (is_naive(cgraph)) { return naive_compute(cgraph, core, device, config); @@ -77,12 +83,6 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin auto start_time = ggml_time_us(); - static std::mutex cache_mutex; - static std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache; - static std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache; - static std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache; - static std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_output_names_cache; - std::shared_ptr<GgmlOvDecoder> ggml_decoder; std::shared_ptr<ov::InferRequest> infer_request; ModelParams m_params; @@ -98,11 +98,11 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin int64_t infer_end_time; { - std::lock_guard lock(cache_mutex); + std::lock_guard lock(r_ctx->cache_mutex); - auto it = decoder_cache.find(key); + auto it = r_ctx->decoder_cache.find(key); - cache_hit = it != decoder_cache.end(); + cache_hit = it != r_ctx->decoder_cache.end(); ModelParams old_m_params; if (cache_hit) { ggml_decoder = it->second; @@ -118,7 +118,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin ggml_decoder->update_io(cgraph); } ggml_decoder->add_extra_inputs(); - infer_request = infer_request_cache.at(key); + infer_request = 
r_ctx->infer_request_cache.at(key); if (stateful) { const auto * inp_pos = get_inp_pos_tensor(cgraph); @@ -126,9 +126,9 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin auto pos_shape = ggml_decoder->get_shape(inp_pos); if (pos_data[0] == 0) { infer_request->reset_state(); - stateful_kv_size = pos_shape[3]; - } else if (stateful_kv_size == static_cast<size_t>(pos_data[0])) { - stateful_kv_size += pos_shape[3]; + r_ctx->stateful_kv_size = pos_shape[3]; + } else if (r_ctx->stateful_kv_size == static_cast<size_t>(pos_data[0])) { + r_ctx->stateful_kv_size += pos_shape[3]; } else { auto states = infer_request->query_state(); for (auto state : states) { @@ -138,7 +138,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin ov::Tensor new_state_tensor(state_tensor, begin, end); state.set_state(new_state_tensor); } - stateful_kv_size = pos_data[0] + 1; + r_ctx->stateful_kv_size = pos_data[0] + 1; } } @@ -146,7 +146,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin conversion_end_time = decoder_end_time; compile_end_time = decoder_end_time; } else { - infer_request_cache.erase(key); + r_ctx->infer_request_cache.erase(key); std::shared_ptr<ov::Model> model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); @@ -176,8 +176,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin } compile_end_time = ggml_time_us(); infer_request = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request()); - infer_request_cache[key] = infer_request; - decoder_cache[key] = ggml_decoder; + r_ctx->infer_request_cache[key] = infer_request; + r_ctx->decoder_cache[key] = ggml_decoder; std::vector<std::string> ov_input_names; std::vector<std::string> ov_output_names; @@ -187,12 +187,16 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin for (const auto & ov_output : model->get_results()) { ov_output_names.push_back(ov_output->get_friendly_name()); } - ov_input_names_cache[key] = 
std::move(ov_input_names); - ov_output_names_cache[key] = std::move(ov_output_names); + r_ctx->ov_input_names_cache[key] = std::move(ov_input_names); + r_ctx->ov_output_names_cache[key] = std::move(ov_output_names); + + if (stateful) { + r_ctx->stateful_kv_size = 0; + } } - auto ov_input_names = ov_input_names_cache[key]; - auto ov_output_names = ov_output_names_cache[key]; + auto ov_input_names = r_ctx->ov_input_names_cache[key]; + auto ov_output_names = r_ctx->ov_output_names_cache[key]; for (size_t i = 0; i < ov_input_names.size(); i++) { auto param_name = ov_input_names[i]; @@ -233,7 +237,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin return GGML_STATUS_SUCCESS; } -enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { +enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx) { auto & core = ov_singleton_core(); auto get_prefill_chunk_size = [] { @@ -256,13 +260,6 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { auto start_time = ggml_time_us(); - static std::mutex cache_mutex; - static std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache; - static std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache; - static std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache_prefill; - static std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache; - static std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_output_names_cache; - std::shared_ptr<GgmlOvDecoder> ggml_decoder; std::shared_ptr<ov::InferRequest> infer_request; ModelParams m_params; @@ -280,11 +277,11 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { int64_t infer_end_time; { - std::lock_guard lock(cache_mutex); + std::lock_guard lock(r_ctx->cache_mutex); - auto it = decoder_cache.find(key); + auto it = r_ctx->decoder_cache.find(key); - cache_hit = it != decoder_cache.end(); + cache_hit = it != r_ctx->decoder_cache.end(); ModelParams old_m_params; if (cache_hit) { ggml_decoder = it->second; @@ -301,14 +298,14 @@ enum ggml_status 
ov_graph_compute_static(ggml_cgraph * cgraph) { ggml_decoder->update_io(cgraph); } ggml_decoder->add_extra_inputs(); - infer_request = is_prefill ? infer_request_cache_prefill.at(key) : infer_request_cache.at(key); + infer_request = is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key); decoder_end_time = ggml_time_us(); conversion_end_time = decoder_end_time; compile_end_time = decoder_end_time; } else { - infer_request_cache.erase(key); - infer_request_cache_prefill.erase(key); + r_ctx->infer_request_cache.erase(key); + r_ctx->infer_request_cache_prefill.erase(key); std::shared_ptr<ov::Model> model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); @@ -348,15 +345,15 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { compiled_model_decode = core.compile_model(model_decode, device, config); } - infer_request_cache_prefill[key] = + r_ctx->infer_request_cache_prefill[key] = std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request()); - infer_request_cache[key] = std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request()); + r_ctx->infer_request_cache[key] = std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request()); compile_end_time = ggml_time_us(); model = is_prefill ? model_prefill : model_decode; ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode; - infer_request = is_prefill ? infer_request_cache_prefill[key] : infer_request_cache[key]; - decoder_cache[key] = ggml_decoder; + infer_request = is_prefill ? 
r_ctx->infer_request_cache_prefill[key] : r_ctx->infer_request_cache[key]; + r_ctx->decoder_cache[key] = ggml_decoder; std::vector<std::string> ov_input_names; std::vector<std::string> ov_output_names; @@ -366,13 +363,13 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { for (const auto & ov_output : model->get_results()) { ov_output_names.push_back(ov_output->get_friendly_name()); } - ov_input_names_cache[key] = std::move(ov_input_names); - ov_output_names_cache[key] = std::move(ov_output_names); + r_ctx->ov_input_names_cache[key] = std::move(ov_input_names); + r_ctx->ov_output_names_cache[key] = std::move(ov_output_names); } } - auto ov_input_names = ov_input_names_cache[key]; - auto ov_output_names = ov_output_names_cache[key]; + auto ov_input_names = r_ctx->ov_input_names_cache[key]; + auto ov_output_names = r_ctx->ov_output_names_cache[key]; if (is_prefill) { auto inp_len = inp_pos->ne[0]; diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 7c403b7d890..c9caa507f84 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -36,10 +36,28 @@ struct graph_key_hash { } }; -enum ggml_status ov_graph_compute(struct ggml_cgraph * cgraph); +struct ov_runtime_context { + std::string device; + bool stateful; + size_t stateful_kv_size; + std::mutex cache_mutex; + std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache; + std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache; + std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache_prefill; + std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache; + std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_output_names_cache; + + ov_runtime_context() : + ov_core(ov_singleton_core()), + device("CPU"), + stateful(false), + stateful_kv_size(0) {} +}; + +enum ggml_status ov_graph_compute(struct ggml_cgraph * cgraph, ggml_backend_t backend); -enum ggml_status ov_graph_compute_dynamic(struct ggml_cgraph * cgraph, const std::string & device, bool stateful = false); -enum ggml_status 
ov_graph_compute_static(struct ggml_cgraph * cgraph); +enum ggml_status ov_graph_compute_dynamic(struct ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx); +enum ggml_status ov_graph_compute_static(struct ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx); size_t checksum(const void * data, size_t size); From 41179c09d7177401d97bc8dedf725bd2ec310080 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Thu, 26 Feb 2026 16:08:05 -0800 Subject: [PATCH 2/3] Removed initialization of a non-existent variable --- ggml/src/ggml-openvino/utils.h | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index c9caa507f84..fe4e0c68fdc 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -48,7 +48,6 @@ struct ov_runtime_context { std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_output_names_cache; ov_runtime_context() : - ov_core(ov_singleton_core()), device("CPU"), stateful(false), stateful_kv_size(0) {} From 252ef84bec8796521fba3cda27677baf3a50ec2b Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 27 Feb 2026 11:38:35 +0800 Subject: [PATCH 3/3] Remove unused structs --- ggml/include/ggml-openvino.h | 27 +----------- ggml/src/ggml-openvino/ggml-openvino-extra.h | 29 ++++--------- ggml/src/ggml-openvino/ggml-openvino.cpp | 43 +++++++++----------- ggml/src/ggml-openvino/utils.cpp | 11 +---- ggml/src/ggml-openvino/utils.h | 4 ++ 5 files changed, 33 insertions(+), 81 deletions(-) diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h index b68b55d1e81..c43beb07b6a 100644 --- a/ggml/include/ggml-openvino.h +++ b/ggml/include/ggml-openvino.h @@ -1,17 +1,14 @@ #pragma once #include "ggml-backend.h" -#include "ggml.h" -#include <array> #include #ifdef __cplusplus extern "C" { #endif -#define GGML_OPENVINO_NAME "OPENVINO" -#define GGML_OPENVINO_MAX_DEVICES 16 +#define GGML_OPENVINO_NAME "OPENVINO" // backend API GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device); @@ -35,28 +32,6 @@ 
GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void); -struct ggml_openvino_device_info { - int device_count; - - struct openvino_device_info { - int cc; // compute capability - int nsm; // number of streaming multiprocessors - size_t smpb; // max. shared memory per block - size_t smpbo; // max. shared memory per block (with opt-in) - bool vmm; // virtual memory support - size_t vmm_granularity; // granularity of virtual memory - size_t total_vram; - }; - - openvino_device_info devices[GGML_OPENVINO_MAX_DEVICES] = {}; - - std::array<float, GGML_OPENVINO_MAX_DEVICES> default_tensor_split = {}; -}; - #ifdef __cplusplus } #endif - -#ifdef __cplusplus -const ggml_openvino_device_info & ggml_openvino_info(); -#endif diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index 292c57212be..cd0baf4a681 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -172,26 +172,11 @@ void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_ext // OpenVINO Backend Context and Interface // ===================================================== struct ggml_backend_openvino_context { - int device; // the device ID currently in use - std::string name; // context Name - std::string description; // context description - - // OpenVINO runtime context - std::shared_ptr<void> ov_runtime_context; - - // OpenVINO Multi-stream support - static const int MAX_STREAMS = 8; // define the maximum number of flows - std::vector<ov::InferRequest> streams; // used to support multi-stream reasoning - int current_stream; // the currently active stream index - - // state Management - bool is_initialized; // initialize - - ggml_backend_openvino_context() : - device(0), - name("OpenVINO"), - description("OpenVINO Backend Context"), - current_stream(0), - ov_runtime_context(nullptr), - is_initialized(false) {} + int device = 0; + std::string name = "OpenVINO"; 
+ std::string description = "OpenVINO Backend Context"; + + std::shared_ptr<void> runtime_context = nullptr; + + ggml_backend_openvino_context() = default; }; diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index d117be6024d..478f26e90e6 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -9,6 +9,7 @@ #include "ggml.h" #include +#include <memory> #include #include #include @@ -627,7 +628,7 @@ static const ggml_backend_i ggml_backend_openvino_interface = { }; int ggml_backend_openvino_get_device_count() { - return ggml_openvino_info().device_count; + return 1; } static ggml_guid_t ggml_backend_openvino_guid(void) { @@ -649,6 +650,17 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) { return nullptr; } + ctx->runtime_context = std::make_shared<ov_runtime_context>(); + if (ctx->runtime_context == nullptr) { + GGML_LOG_ERROR("%s: failed to allocate runtime context\n", __func__); + delete ctx; + return nullptr; + } + + std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context); + r_ctx->device = ggml_openvino_get_device_name(); + r_ctx->stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !ggml_openvino_is_npu(); + ggml_backend_t openvino_backend = new ggml_backend{ /* .guid = */ ggml_backend_openvino_guid(), /* .interface = */ ggml_backend_openvino_interface, @@ -1026,7 +1038,7 @@ static const char * ggml_backend_openvino_reg_get_name(ggml_backend_reg_t reg) { static size_t ggml_backend_openvino_reg_get_device_count(ggml_backend_reg_t reg) { GGML_UNUSED(reg); - return ggml_openvino_info().device_count; + return (size_t) ggml_backend_openvino_get_device_count(); } static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_t reg, size_t index) { @@ -1035,36 +1047,17 @@ static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_ return ctx->devices[index]; } -static void * 
ggml_backend_openvino_get_proc_address(ggml_backend_reg_t reg, const char * name) { - GGML_UNUSED(reg); - GGML_UNUSED(name); - return nullptr; -} - static const struct ggml_backend_reg_i ggml_backend_openvino_reg_interface = { /* .get_name = */ ggml_backend_openvino_reg_get_name, /* .get_device_count = */ ggml_backend_openvino_reg_get_device_count, /* .get_device = */ ggml_backend_openvino_reg_get_device, - /* .get_proc_address = */ ggml_backend_openvino_get_proc_address, + /* .get_proc_address = */ NULL, }; -static int get_openvino_device_count() { - return 1; -} - -static ggml_openvino_device_info ggml_openvino_init() { +static void ggml_openvino_init() { // Initialize device config singleton from env var ggml_openvino_init_device_config(); GGML_LOG_INFO("OpenVINO: using device %s\n", ggml_openvino_get_device_name().c_str()); - - ggml_openvino_device_info info = {}; - info.device_count = get_openvino_device_count(); - return info; -} - -const ggml_openvino_device_info & ggml_openvino_info() { - static ggml_openvino_device_info info = ggml_openvino_init(); - return info; } GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { @@ -1075,9 +1068,11 @@ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { static std::mutex mutex; std::lock_guard lock(mutex); if (!initialized) { + ggml_openvino_init(); + ggml_backend_openvino_reg_context * ctx = new ggml_backend_openvino_reg_context; - for (int i = 0; i < ggml_openvino_info().device_count; i++) { + for (int i = 0; i < ggml_backend_openvino_get_device_count(); i++) { ggml_backend_openvino_device_context * dev_ctx = new ggml_backend_openvino_device_context; dev_ctx->device = i; dev_ctx->name = GGML_OPENVINO_NAME + std::to_string(i); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 07afd34d7ca..05478cbc3e6 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -47,15 +47,8 @@ enum ggml_status ov_graph_compute(ggml_cgraph 
* cgraph, ggml_backend_t backend) const auto is_static = ggml_openvino_is_npu(); - if (ctx->ov_runtime_context == nullptr) { - ctx->ov_runtime_context = std::make_shared<ov_runtime_context>(); - } - std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->ov_runtime_context); - r_ctx->device = ggml_openvino_get_device_name(); - r_ctx->stateful = false; - if (getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !is_static) { - r_ctx->stateful = true; - } + GGML_ASSERT(ctx->runtime_context != nullptr); + std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context); return is_static ? ov_graph_compute_static(cgraph, r_ctx) : ov_graph_compute_dynamic(cgraph, r_ctx); } catch (const ov::Exception & e) { GGML_LOG_ERROR("GGML OpenVINO backend ov::Exception: %s\n", e.what()); return GGML_STATUS_FAILED; } diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index fe4e0c68fdc..ebc69ad7f66 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -4,8 +4,12 @@ #include #include +#include <memory> +#include <mutex> #include #include +#include <utility> +#include <vector> struct graph_key { int n_nodes;