From 40d2bb23a14d4d2a2b06a29ee4fa02f45f23b23d Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Thu, 26 Feb 2026 15:55:24 -0800 Subject: [PATCH 1/3] Removed static variables from utils.cpp --- ggml/src/ggml-openvino/ggml-openvino-extra.h | 28 ++++ ggml/src/ggml-openvino/ggml-openvino.cpp | 32 +------ ggml/src/ggml-openvino/utils.cpp | 99 ++++++++++---------- ggml/src/ggml-openvino/utils.h | 24 ++++- 4 files changed, 98 insertions(+), 85 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index 441a62e9d30..292c57212be 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -167,3 +167,31 @@ ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor // Register an extra with the tensor's OpenVINO buffer context for proper lifetime management. // This sets tensor->extra and tracks the extra in the buffer context for cleanup. void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra); + +// ===================================================== +// OpenVINO Backend Context and Interface +// ===================================================== +struct ggml_backend_openvino_context { + int device; // the device ID currently in use + std::string name; // context Name + std::string description; // context description + + // OpenVINO runtime context + std::shared_ptr<void> ov_runtime_context; + + // OpenVINO Multi-stream support + static const int MAX_STREAMS = 8; // define the maximum number of flows + std::vector<ov::InferRequest> streams; // used to support multi-stream reasoning + int current_stream; // the currently active stream index + + // state Management + bool is_initialized; // initialize + + ggml_backend_openvino_context() : + device(0), + name("OpenVINO"), + description("OpenVINO Backend Context"), + current_stream(0), + ov_runtime_context(nullptr), + is_initialized(false) {} +}; diff --git 
a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 948ff2cc780..d117be6024d 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -593,36 +593,6 @@ bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft) { return buft->iface.get_name == ggml_backend_openvino_host_buffer_type_get_name; } -// ===================================================== -// OpenVINO Backend Context and Interface -// ===================================================== - -struct ggml_backend_openvino_context { - int device; // the device ID currently in use - std::string name; // context Name - std::string description; // context description - - // OpenVINO core components - ov::Core core; // OpenVINO core interface - std::shared_ptr<ov::Model> model; // compiled Model - ov::InferRequest infer_request; // inference Request - - // OpenVINO Multi-stream support - static const int MAX_STREAMS = 8; // define the maximum number of flows - std::vector<ov::InferRequest> streams; // used to support multi-stream reasoning - int current_stream; // the currently active stream index - - // state Management - bool is_initialized; // initialize - - ggml_backend_openvino_context() : - device(0), - name("OpenVINO"), - description("OpenVINO Backend Context"), - current_stream(0), - is_initialized(false) {} -}; - static void ggml_backend_openvino_free(ggml_backend_t backend) { ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context; delete ctx; @@ -635,7 +605,7 @@ static const char * ggml_backend_openvino_get_name(ggml_backend_t backend) { } static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - return ov_graph_compute(cgraph); + return ov_graph_compute(cgraph, backend); GGML_UNUSED(backend); } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index a3b7887ae58..07afd34d7ca 100644 --- 
a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -37,22 +37,27 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" -enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) { +enum ggml_status ov_graph_compute(ggml_cgraph * cgraph, ggml_backend_t backend) { + ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context; try { if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { std::string filename = "cgraph_ov.txt"; GgmlOvDecoder::dump_cgraph(cgraph, filename); } - // Use device from singleton (initialized during backend init) - const auto & device = ggml_openvino_get_device_name(); const auto is_static = ggml_openvino_is_npu(); - bool stateful = false; + + if (ctx->ov_runtime_context == nullptr) { + ctx->ov_runtime_context = std::make_shared<ov_runtime_context>(); + } + std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->ov_runtime_context); + r_ctx->device = ggml_openvino_get_device_name(); + r_ctx->stateful = false; if (getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !is_static) { - stateful = true; + r_ctx->stateful = true; } - return is_static ? ov_graph_compute_static(cgraph) : ov_graph_compute_dynamic(cgraph, device, stateful); + return is_static ? 
ov_graph_compute_static(cgraph, r_ctx) : ov_graph_compute_dynamic(cgraph, r_ctx); } catch (const ov::Exception & e) { GGML_LOG_ERROR("GGML OpenVINO backend ov::Exception: %s\n", e.what()); return GGML_STATUS_FAILED; @@ -65,11 +70,12 @@ enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) { } } -enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::string & device, bool stateful) { +enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx) { auto & core = ov_singleton_core(); const auto & config = ggml_openvino_get_compile_config(); + auto device = r_ctx->device; + bool stateful = r_ctx->stateful; static auto is_static = false; - static size_t stateful_kv_size = 0; if (is_naive(cgraph)) { return naive_compute(cgraph, core, device, config); @@ -77,12 +83,6 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin auto start_time = ggml_time_us(); - static std::mutex cache_mutex; - static std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache; - static std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache; - static std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache; - static std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_output_names_cache; - std::shared_ptr<GgmlOvDecoder> ggml_decoder; std::shared_ptr<ov::InferRequest> infer_request; ModelParams m_params; @@ -98,11 +98,11 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin int64_t infer_end_time; { - std::lock_guard lock(cache_mutex); + std::lock_guard lock(r_ctx->cache_mutex); - auto it = decoder_cache.find(key); + auto it = r_ctx->decoder_cache.find(key); - cache_hit = it != decoder_cache.end(); + cache_hit = it != r_ctx->decoder_cache.end(); ModelParams old_m_params; if (cache_hit) { ggml_decoder = it->second; @@ -118,7 +118,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin ggml_decoder->update_io(cgraph); } ggml_decoder->add_extra_inputs(); - infer_request = infer_request_cache.at(key); + infer_request = 
r_ctx->infer_request_cache.at(key); if (stateful) { const auto * inp_pos = get_inp_pos_tensor(cgraph); @@ -126,9 +126,9 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin auto pos_shape = ggml_decoder->get_shape(inp_pos); if (pos_data[0] == 0) { infer_request->reset_state(); - stateful_kv_size = pos_shape[3]; - } else if (stateful_kv_size == static_cast<size_t>(pos_data[0])) { - stateful_kv_size += pos_shape[3]; + r_ctx->stateful_kv_size = pos_shape[3]; + } else if (r_ctx->stateful_kv_size == static_cast<size_t>(pos_data[0])) { + r_ctx->stateful_kv_size += pos_shape[3]; } else { auto states = infer_request->query_state(); for (auto state : states) { @@ -138,7 +138,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin ov::Tensor new_state_tensor(state_tensor, begin, end); state.set_state(new_state_tensor); } - stateful_kv_size = pos_data[0] + 1; + r_ctx->stateful_kv_size = pos_data[0] + 1; } } @@ -146,7 +146,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin conversion_end_time = decoder_end_time; compile_end_time = decoder_end_time; } else { - infer_request_cache.erase(key); + r_ctx->infer_request_cache.erase(key); std::shared_ptr<ov::Model> model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); @@ -176,8 +176,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin } compile_end_time = ggml_time_us(); infer_request = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request()); - infer_request_cache[key] = infer_request; - decoder_cache[key] = ggml_decoder; + r_ctx->infer_request_cache[key] = infer_request; + r_ctx->decoder_cache[key] = ggml_decoder; std::vector<std::string> ov_input_names; std::vector<std::string> ov_output_names; @@ -187,12 +187,16 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin for (const auto & ov_output : model->get_results()) { ov_output_names.push_back(ov_output->get_friendly_name()); } - ov_input_names_cache[key] = 
std::move(ov_input_names); - ov_output_names_cache[key] = std::move(ov_output_names); + r_ctx->ov_input_names_cache[key] = std::move(ov_input_names); + r_ctx->ov_output_names_cache[key] = std::move(ov_output_names); + + if (stateful) { + r_ctx->stateful_kv_size = 0; + } } - auto ov_input_names = ov_input_names_cache[key]; - auto ov_output_names = ov_output_names_cache[key]; + auto ov_input_names = r_ctx->ov_input_names_cache[key]; + auto ov_output_names = r_ctx->ov_output_names_cache[key]; for (size_t i = 0; i < ov_input_names.size(); i++) { auto param_name = ov_input_names[i]; @@ -233,7 +237,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin return GGML_STATUS_SUCCESS; } -enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { +enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx) { auto & core = ov_singleton_core(); auto get_prefill_chunk_size = [] { @@ -256,13 +260,6 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { auto start_time = ggml_time_us(); - static std::mutex cache_mutex; - static std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache; - static std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache; - static std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache_prefill; - static std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache; - static std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_output_names_cache; - std::shared_ptr<GgmlOvDecoder> ggml_decoder; std::shared_ptr<ov::InferRequest> infer_request; ModelParams m_params; @@ -280,11 +277,11 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { int64_t infer_end_time; { - std::lock_guard lock(cache_mutex); + std::lock_guard lock(r_ctx->cache_mutex); - auto it = decoder_cache.find(key); + auto it = r_ctx->decoder_cache.find(key); - cache_hit = it != decoder_cache.end(); + cache_hit = it != r_ctx->decoder_cache.end(); ModelParams old_m_params; if (cache_hit) { ggml_decoder = it->second; @@ -301,14 +298,14 @@ enum ggml_status 
ov_graph_compute_static(ggml_cgraph * cgraph) { ggml_decoder->update_io(cgraph); } ggml_decoder->add_extra_inputs(); - infer_request = is_prefill ? infer_request_cache_prefill.at(key) : infer_request_cache.at(key); + infer_request = is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key); decoder_end_time = ggml_time_us(); conversion_end_time = decoder_end_time; compile_end_time = decoder_end_time; } else { - infer_request_cache.erase(key); - infer_request_cache_prefill.erase(key); + r_ctx->infer_request_cache.erase(key); + r_ctx->infer_request_cache_prefill.erase(key); std::shared_ptr<ov::Model> model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); @@ -348,15 +345,15 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { compiled_model_decode = core.compile_model(model_decode, device, config); } - infer_request_cache_prefill[key] = + r_ctx->infer_request_cache_prefill[key] = std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request()); - infer_request_cache[key] = std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request()); + r_ctx->infer_request_cache[key] = std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request()); compile_end_time = ggml_time_us(); model = is_prefill ? model_prefill : model_decode; ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode; - infer_request = is_prefill ? infer_request_cache_prefill[key] : infer_request_cache[key]; - decoder_cache[key] = ggml_decoder; + infer_request = is_prefill ? 
r_ctx->infer_request_cache_prefill[key] : r_ctx->infer_request_cache[key]; + r_ctx->decoder_cache[key] = ggml_decoder; std::vector<std::string> ov_input_names; std::vector<std::string> ov_output_names; @@ -366,13 +363,13 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { for (const auto & ov_output : model->get_results()) { ov_output_names.push_back(ov_output->get_friendly_name()); } - ov_input_names_cache[key] = std::move(ov_input_names); - ov_output_names_cache[key] = std::move(ov_output_names); + r_ctx->ov_input_names_cache[key] = std::move(ov_input_names); + r_ctx->ov_output_names_cache[key] = std::move(ov_output_names); } } - auto ov_input_names = ov_input_names_cache[key]; - auto ov_output_names = ov_output_names_cache[key]; + auto ov_input_names = r_ctx->ov_input_names_cache[key]; + auto ov_output_names = r_ctx->ov_output_names_cache[key]; if (is_prefill) { auto inp_len = inp_pos->ne[0]; diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 7c403b7d890..c9caa507f84 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -36,10 +36,28 @@ struct graph_key_hash { } }; -enum ggml_status ov_graph_compute(struct ggml_cgraph * cgraph); +struct ov_runtime_context { + std::string device; + bool stateful; + size_t stateful_kv_size; + std::mutex cache_mutex; + std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache; + std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache; + std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache_prefill; + std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache; + std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_output_names_cache; + + ov_runtime_context() : + ov_core(ov_singleton_core()), + device("CPU"), + stateful(false), + stateful_kv_size(0) {} +}; + +enum ggml_status ov_graph_compute(struct ggml_cgraph * cgraph, ggml_backend_t backend); -enum ggml_status ov_graph_compute_dynamic(struct ggml_cgraph * cgraph, const std::string & device, bool stateful = false); -enum ggml_status 
ov_graph_compute_static(struct ggml_cgraph * cgraph); +enum ggml_status ov_graph_compute_dynamic(struct ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx); +enum ggml_status ov_graph_compute_static(struct ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx); size_t checksum(const void * data, size_t size); From 41179c09d7177401d97bc8dedf725bd2ec310080 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Thu, 26 Feb 2026 16:08:05 -0800 Subject: [PATCH 2/3] Removed initialization of a non-existent variable --- ggml/src/ggml-openvino/utils.h | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index c9caa507f84..fe4e0c68fdc 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -48,7 +48,6 @@ struct ov_runtime_context { std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_output_names_cache; ov_runtime_context() : - ov_core(ov_singleton_core()), device("CPU"), stateful(false), stateful_kv_size(0) {} From 252ef84bec8796521fba3cda27677baf3a50ec2b Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 27 Feb 2026 11:38:35 +0800 Subject: [PATCH 3/3] Remove unused structs --- ggml/include/ggml-openvino.h | 27 +----------- ggml/src/ggml-openvino/ggml-openvino-extra.h | 29 ++++--------- ggml/src/ggml-openvino/ggml-openvino.cpp | 43 +++++++++----------- ggml/src/ggml-openvino/utils.cpp | 11 +---- ggml/src/ggml-openvino/utils.h | 4 ++ 5 files changed, 33 insertions(+), 81 deletions(-) diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h index b68b55d1e81..c43beb07b6a 100644 --- a/ggml/include/ggml-openvino.h +++ b/ggml/include/ggml-openvino.h @@ -1,17 +1,14 @@ #pragma once #include "ggml-backend.h" -#include "ggml.h" -#include <array> #include #ifdef __cplusplus extern "C" { #endif -#define GGML_OPENVINO_NAME "OPENVINO" -#define GGML_OPENVINO_MAX_DEVICES 16 +#define GGML_OPENVINO_NAME "OPENVINO" // backend API GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device); @@ -35,28 +32,6 @@ 
GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void); -struct ggml_openvino_device_info { - int device_count; - - struct openvino_device_info { - int cc; // compute capability - int nsm; // number of streaming multiprocessors - size_t smpb; // max. shared memory per block - size_t smpbo; // max. shared memory per block (with opt-in) - bool vmm; // virtual memory support - size_t vmm_granularity; // granularity of virtual memory - size_t total_vram; - }; - - openvino_device_info devices[GGML_OPENVINO_MAX_DEVICES] = {}; - - std::array<float, GGML_OPENVINO_MAX_DEVICES> default_tensor_split = {}; -}; - #ifdef __cplusplus } #endif - -#ifdef __cplusplus -const ggml_openvino_device_info & ggml_openvino_info(); -#endif diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index 292c57212be..cd0baf4a681 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -172,26 +172,11 @@ void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_ext // OpenVINO Backend Context and Interface // ===================================================== struct ggml_backend_openvino_context { - int device; // the device ID currently in use - std::string name; // context Name - std::string description; // context description - - // OpenVINO runtime context - std::shared_ptr<void> ov_runtime_context; - - // OpenVINO Multi-stream support - static const int MAX_STREAMS = 8; // define the maximum number of flows - std::vector<ov::InferRequest> streams; // used to support multi-stream reasoning - int current_stream; // the currently active stream index - - // state Management - bool is_initialized; // initialize - - ggml_backend_openvino_context() : - device(0), - name("OpenVINO"), - description("OpenVINO Backend Context"), - current_stream(0), - ov_runtime_context(nullptr), - is_initialized(false) {} + int device = 0; + std::string name = "OpenVINO"; 
+ std::string description = "OpenVINO Backend Context"; + + std::shared_ptr<void> runtime_context = nullptr; + + ggml_backend_openvino_context() = default; }; diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index d117be6024d..478f26e90e6 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -9,6 +9,7 @@ #include "ggml.h" #include +#include <memory> #include #include #include @@ -627,7 +628,7 @@ static const ggml_backend_i ggml_backend_openvino_interface = { }; int ggml_backend_openvino_get_device_count() { - return ggml_openvino_info().device_count; + return 1; } static ggml_guid_t ggml_backend_openvino_guid(void) { @@ -649,6 +650,17 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) { return nullptr; } + ctx->runtime_context = std::make_shared<ov_runtime_context>(); + if (ctx->runtime_context == nullptr) { + GGML_LOG_ERROR("%s: failed to allocate runtime context\n", __func__); + delete ctx; + return nullptr; + } + + std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context); + r_ctx->device = ggml_openvino_get_device_name(); + r_ctx->stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !ggml_openvino_is_npu(); + ggml_backend_t openvino_backend = new ggml_backend{ /* .guid = */ ggml_backend_openvino_guid(), /* .interface = */ ggml_backend_openvino_interface, @@ -1026,7 +1038,7 @@ static const char * ggml_backend_openvino_reg_get_name(ggml_backend_reg_t reg) { static size_t ggml_backend_openvino_reg_get_device_count(ggml_backend_reg_t reg) { GGML_UNUSED(reg); - return ggml_openvino_info().device_count; + return (size_t) ggml_backend_openvino_get_device_count(); } static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_t reg, size_t index) { @@ -1035,36 +1047,17 @@ static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_ return ctx->devices[index]; } -static void * 
ggml_backend_openvino_get_proc_address(ggml_backend_reg_t reg, const char * name) { - GGML_UNUSED(reg); - GGML_UNUSED(name); - return nullptr; -} - static const struct ggml_backend_reg_i ggml_backend_openvino_reg_interface = { /* .get_name = */ ggml_backend_openvino_reg_get_name, /* .get_device_count = */ ggml_backend_openvino_reg_get_device_count, /* .get_device = */ ggml_backend_openvino_reg_get_device, - /* .get_proc_address = */ ggml_backend_openvino_get_proc_address, + /* .get_proc_address = */ NULL, }; -static int get_openvino_device_count() { - return 1; -} - -static ggml_openvino_device_info ggml_openvino_init() { +static void ggml_openvino_init() { // Initialize device config singleton from env var ggml_openvino_init_device_config(); GGML_LOG_INFO("OpenVINO: using device %s\n", ggml_openvino_get_device_name().c_str()); - - ggml_openvino_device_info info = {}; - info.device_count = get_openvino_device_count(); - return info; -} - -const ggml_openvino_device_info & ggml_openvino_info() { - static ggml_openvino_device_info info = ggml_openvino_init(); - return info; } GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { @@ -1075,9 +1068,11 @@ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { static std::mutex mutex; std::lock_guard lock(mutex); if (!initialized) { + ggml_openvino_init(); + ggml_backend_openvino_reg_context * ctx = new ggml_backend_openvino_reg_context; - for (int i = 0; i < ggml_openvino_info().device_count; i++) { + for (int i = 0; i < ggml_backend_openvino_get_device_count(); i++) { ggml_backend_openvino_device_context * dev_ctx = new ggml_backend_openvino_device_context; dev_ctx->device = i; dev_ctx->name = GGML_OPENVINO_NAME + std::to_string(i); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 07afd34d7ca..05478cbc3e6 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -47,15 +47,8 @@ enum ggml_status ov_graph_compute(ggml_cgraph 
* cgraph, ggml_backend_t backend) const auto is_static = ggml_openvino_is_npu(); - if (ctx->ov_runtime_context == nullptr) { - ctx->ov_runtime_context = std::make_shared<ov_runtime_context>(); - } - std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->ov_runtime_context); - r_ctx->device = ggml_openvino_get_device_name(); - r_ctx->stateful = false; - if (getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !is_static) { - r_ctx->stateful = true; - } + GGML_ASSERT(ctx->runtime_context != nullptr); + std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context); return is_static ? ov_graph_compute_static(cgraph, r_ctx) : ov_graph_compute_dynamic(cgraph, r_ctx); } catch (const ov::Exception & e) { GGML_LOG_ERROR("GGML OpenVINO backend ov::Exception: %s\n", e.what()); return GGML_STATUS_FAILED; } diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index fe4e0c68fdc..ebc69ad7f66 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -4,8 +4,12 @@ #include #include +#include <memory> +#include <mutex> #include #include +#include <utility> +#include <vector> struct graph_key { int n_nodes;