ravi9 · wine99 · Feb 27, 2026 · Feb 26, 2026 · Feb 27, 2026 · Feb 27, 2026
diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h
@@ -1,17 +1,14 @@
 #pragma once
 
 #include "ggml-backend.h"
-#include "ggml.h"
 
-#include <array>
 #include <cstring>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define GGML_OPENVINO_NAME        "OPENVINO"
-#define GGML_OPENVINO_MAX_DEVICES 16
+#define GGML_OPENVINO_NAME "OPENVINO"
 
 // backend API
 GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device);
@@ -35,28 +32,6 @@ GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void);
 
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void);
 
-struct ggml_openvino_device_info {
-    int device_count;
-
-    struct openvino_device_info {
-        int    cc;               // compute capability
-        int    nsm;              // number of streaming multiprocessors
-        size_t smpb;             // max. shared memory per block
-        size_t smpbo;            // max. shared memory per block (with opt-in)
-        bool   vmm;              // virtual memory support
-        size_t vmm_granularity;  // granularity of virtual memory
-        size_t total_vram;
-    };
-
-    openvino_device_info devices[GGML_OPENVINO_MAX_DEVICES] = {};
-
-    std::array<float, GGML_OPENVINO_MAX_DEVICES> default_tensor_split = {};
-};
-
 #ifdef __cplusplus
 }
 #endif
-
-#ifdef __cplusplus
-const ggml_openvino_device_info & ggml_openvino_info();
-#endif
@@ -167,3 +167,16 @@ ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor
 // Register an extra with the tensor's OpenVINO buffer context for proper lifetime management.
 // This sets tensor->extra and tracks the extra in the buffer context for cleanup.
 void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra);
+
+// =====================================================
+// OpenVINO Backend Context and Interface
+// =====================================================
+struct ggml_backend_openvino_context {
+    int device = 0;
+    std::string name = "OpenVINO";
+    std::string description = "OpenVINO Backend Context";
+
+    std::shared_ptr<void> runtime_context = nullptr;
+
+    ggml_backend_openvino_context() = default;
+};
@@ -9,6 +9,7 @@
 #include "ggml.h"
 
 #include <atomic>
+#include <cstdlib>
 #include <cstdint>
 #include <cstring>
 #include <memory>
@@ -593,36 +594,6 @@ bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name == ggml_backend_openvino_host_buffer_type_get_name;
 }
 
-// =====================================================
-// OpenVINO Backend Context and Interface
-// =====================================================
-
-struct ggml_backend_openvino_context {
-    int device;               // the device ID currently in use
-    std::string name;         // context Name
-    std::string description;  // context description
-
-    // OpenVINO core components
-    ov::Core core;                             // OpenVINO core interface
-    std::shared_ptr<ov::CompiledModel> model;  // compiled Model
-    ov::InferRequest infer_request;            // inference Request
-
-    // OpenVINO Multi-stream support
-    static const int MAX_STREAMS = 8;       // define the maximum number of flows
-    std::vector<ov::InferRequest> streams;  // used to support multi-stream reasoning
-    int current_stream;                     // the currently active stream index
-
-    // state Management
-    bool is_initialized;  // initialize
-
-    ggml_backend_openvino_context() :
-        device(0),
-        name("OpenVINO"),
-        description("OpenVINO Backend Context"),
-        current_stream(0),
-        is_initialized(false) {}
-};
-
 static void ggml_backend_openvino_free(ggml_backend_t backend) {
     ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context;
     delete ctx;
@@ -635,7 +606,7 @@ static const char * ggml_backend_openvino_get_name(ggml_backend_t backend) {
 }
 
 static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    return ov_graph_compute(cgraph);
+    return ov_graph_compute(cgraph, backend);
     GGML_UNUSED(backend);
 }
 
@@ -657,7 +628,7 @@ static const ggml_backend_i ggml_backend_openvino_interface = {
 };
 
 int ggml_backend_openvino_get_device_count() {
-    return ggml_openvino_info().device_count;
+    return 1;
 }
 
 static ggml_guid_t ggml_backend_openvino_guid(void) {
@@ -679,6 +650,17 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) {
         return nullptr;
     }
 
+    ctx->runtime_context = std::make_shared<ov_runtime_context>();
+    if (ctx->runtime_context == nullptr) {
+        GGML_LOG_ERROR("%s: failed to allocate runtime context\n", __func__);
+        delete ctx;
+        return nullptr;
+    }
+
+    std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context);
+    r_ctx->device = ggml_openvino_get_device_name();
+    r_ctx->stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !ggml_openvino_is_npu();
+
     ggml_backend_t openvino_backend = new ggml_backend{
         /* .guid      = */ ggml_backend_openvino_guid(),
         /* .interface = */ ggml_backend_openvino_interface,
@@ -1056,7 +1038,7 @@ static const char * ggml_backend_openvino_reg_get_name(ggml_backend_reg_t reg) {
 
 static size_t ggml_backend_openvino_reg_get_device_count(ggml_backend_reg_t reg) {
     GGML_UNUSED(reg);
-    return ggml_openvino_info().device_count;
+    return (size_t) ggml_backend_openvino_get_device_count();
 }
 
 static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_t reg, size_t index) {
@@ -1065,36 +1047,17 @@ static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_
     return ctx->devices[index];
 }
 
-static void * ggml_backend_openvino_get_proc_address(ggml_backend_reg_t reg, const char * name) {
-    GGML_UNUSED(reg);
-    GGML_UNUSED(name);
-    return nullptr;
-}
-
 static const struct ggml_backend_reg_i ggml_backend_openvino_reg_interface = {
     /* .get_name         = */ ggml_backend_openvino_reg_get_name,
     /* .get_device_count = */ ggml_backend_openvino_reg_get_device_count,
     /* .get_device       = */ ggml_backend_openvino_reg_get_device,
-    /* .get_proc_address = */ ggml_backend_openvino_get_proc_address,
+    /* .get_proc_address = */ NULL,
 };
 
-static int get_openvino_device_count() {
-    return 1;
-}
-
-static ggml_openvino_device_info ggml_openvino_init() {
+static void ggml_openvino_init() {
     // Initialize device config singleton from env var
     ggml_openvino_init_device_config();
     GGML_LOG_INFO("OpenVINO: using device %s\n", ggml_openvino_get_device_name().c_str());
-
-    ggml_openvino_device_info info = {};
-    info.device_count = get_openvino_device_count();
-    return info;
-}
-
-const ggml_openvino_device_info & ggml_openvino_info() {
-    static ggml_openvino_device_info info = ggml_openvino_init();
-    return info;
 }
 
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void) {
@@ -1105,9 +1068,11 @@ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void) {
         static std::mutex mutex;
         std::lock_guard<std::mutex> lock(mutex);
         if (!initialized) {
+            ggml_openvino_init();
+
             ggml_backend_openvino_reg_context * ctx = new ggml_backend_openvino_reg_context;
 
-            for (int i = 0; i < ggml_openvino_info().device_count; i++) {
+            for (int i = 0; i < ggml_backend_openvino_get_device_count(); i++) {
                 ggml_backend_openvino_device_context * dev_ctx = new ggml_backend_openvino_device_context;
                 dev_ctx->device = i;
                 dev_ctx->name = GGML_OPENVINO_NAME + std::to_string(i);