diff --git a/behave_framework/src/minifi_behave/steps/core_steps.py b/behave_framework/src/minifi_behave/steps/core_steps.py
index f2efef59b5..3613467f24 100644
--- a/behave_framework/src/minifi_behave/steps/core_steps.py
+++ b/behave_framework/src/minifi_behave/steps/core_steps.py
@@ -25,6 +25,7 @@
 import humanfriendly
 from behave import when, step, given
+from pathlib import Path
 from minifi_behave.containers.http_proxy_container import HttpProxy
 from minifi_behave.containers.nifi_container import NifiContainer
 from minifi_behave.containers.directory import Directory
@@ -82,6 +83,16 @@ def create_file_with_content_in_directory(context: MinifiTestContext, directory:
     context.execute_steps(f'given a directory at "{directory}" has a file with the content "{content}" in the "{DEFAULT_MINIFI_CONTAINER_NAME}" flow')


+@step('a directory at "{directory}" has a file with the content from "{path}"')
+@step("a directory at '{directory}' has a file with the content from '{path}'")
+def create_file_with_content_from_file_in_directory(context: MinifiTestContext, directory: str, path: str):
+    assert context.resource_dir is not None, "Cannot copy file if resource_dir is not set for the context"
+    with open(context.resource_dir / path, "rb") as f:
+        content = f.read()
+    context.execute_steps(f'given a directory at "{directory}" has a file with the content "{content}" in the "{DEFAULT_MINIFI_CONTAINER_NAME}" flow')
+
+
 @step('a directory at "{directory}" has a file "{file_name}" with the content "{content}"')
 def create_file_with_name_and_content_in_directory(context: MinifiTestContext, directory: str, file_name: str, content: str):
     __add_directory_with_file_to_container(context, directory, file_name, content, DEFAULT_MINIFI_CONTAINER_NAME)
diff --git a/cmake/LlamaCpp.cmake b/cmake/LlamaCpp.cmake
index f78101c6f9..d9e764379c 100644
--- a/cmake/LlamaCpp.cmake
+++ b/cmake/LlamaCpp.cmake
@@ -21,6 +21,7 @@ set(BUILD_SHARED_LIBS "OFF" CACHE STRING "" FORCE)
 set(LLAMA_BUILD_TESTS "OFF" CACHE STRING "" FORCE)
 set(LLAMA_BUILD_EXAMPLES "OFF" CACHE STRING "" FORCE)
 set(LLAMA_BUILD_SERVER "OFF" CACHE STRING "" FORCE)
+set(LLAMA_BUILD_COMMON "ON" CACHE STRING "" FORCE)
 set(GGML_OPENMP "OFF" CACHE STRING "" FORCE)
 set(GGML_METAL "OFF" CACHE STRING "" FORCE)
 set(GGML_BLAS "OFF" CACHE STRING "" FORCE)
@@ -30,24 +31,30 @@ else()
     set(GGML_NATIVE "ON" CACHE STRING "" FORCE)
 endif()

-set(PATCH_FILE_1 "${CMAKE_SOURCE_DIR}/thirdparty/llamacpp/lu8_macro_fix.patch")  # https://github.com/ggml-org/llama.cpp/issues/12740
-set(PATCH_FILE_2 "${CMAKE_SOURCE_DIR}/thirdparty/llamacpp/cpp-23-fixes.patch")
+set(PATCH_FILE_1 "${CMAKE_SOURCE_DIR}/thirdparty/llamacpp/mtmd-fix.patch")
 set(PC ${Bash_EXECUTABLE} -c "set -x &&\
-        (\\\"${Patch_EXECUTABLE}\\\" -p1 -R -s -f --dry-run -i \\\"${PATCH_FILE_1}\\\" || \\\"${Patch_EXECUTABLE}\\\" -p1 -N -i \\\"${PATCH_FILE_1}\\\") &&\
-        (\\\"${Patch_EXECUTABLE}\\\" -p1 -R -s -f --dry-run -i \\\"${PATCH_FILE_2}\\\" || \\\"${Patch_EXECUTABLE}\\\" -p1 -N -i \\\"${PATCH_FILE_2}\\\")")
+        (\\\"${Patch_EXECUTABLE}\\\" -p1 -R -s -f --dry-run -i \\\"${PATCH_FILE_1}\\\" || \\\"${Patch_EXECUTABLE}\\\" -p1 -N -i \\\"${PATCH_FILE_1}\\\")")
+
 FetchContent_Declare(llamacpp
-    URL      https://github.com/ggml-org/llama.cpp/archive/refs/tags/b5958.tar.gz
-    URL_HASH SHA256=4e8a2abd83092aa446cd13556f6fe8777139da7b191bdaa0e1b79fe9740b36a6
-    PATCH_COMMAND "${PC}"
-    SYSTEM
+    URL      https://github.com/ggml-org/llama.cpp/archive/refs/tags/b8944.tar.gz
+    URL_HASH SHA256=ca231c8aca086f56bad3ed371f6dc5b01e971e812a8ddf67564f087390c0e781
+    PATCH_COMMAND "${PC}"
+    SYSTEM
 )
 FetchContent_MakeAvailable(llamacpp)

+if(MSVC AND TARGET llama)
+    target_compile_options(llama PRIVATE /Zc:__cplusplus)
+endif()
+
 set(LLAMACPP_INCLUDE_DIRS
         "${llamacpp_SOURCE_DIR}/include"
         "${llamacpp_SOURCE_DIR}/ggml/include"
+        "${llamacpp_SOURCE_DIR}/tools"
+        "${llamacpp_SOURCE_DIR}/common"
+        "${llamacpp_SOURCE_DIR}/vendor"
     CACHE STRING "" FORCE
 )
diff --git a/extension-framework/cpp-extension-lib/include/api/core/FlowFile.h b/extension-framework/cpp-extension-lib/include/api/core/FlowFile.h
index 833dfdaf73..4a288f36d7 100644
--- a/extension-framework/cpp-extension-lib/include/api/core/FlowFile.h
+++ b/extension-framework/cpp-extension-lib/include/api/core/FlowFile.h
@@ -26,7 +26,11 @@ namespace org::apache::nifi::minifi::api::core {
 struct EnsureMovedFromDeleter {
   void operator()(MinifiFlowFile* ff) {
     if (ff) {
-      throw std::logic_error("Each flowfile should be either transferred or removed");
+      if (std::uncaught_exceptions()) {
+        // there is already an exception in progress, do not terminate the process (although there are scenarios we could throw here)
+      } else {
+        throw std::logic_error("Each flowfile should be either transferred or removed");
+      }
     }
   }
 };
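For context on the FlowFile.h change above: a deleter that throws while another exception is already propagating ends the process via std::terminate, so the deleter now only throws when no exception is in flight. Below is a minimal, self-contained sketch of the same guard pattern; the ConsumeGuard type is hypothetical and only illustrates the std::uncaught_exceptions() check, it is not part of this patch.

#include <exception>
#include <stdexcept>

class ConsumeGuard {
 public:
  void consume() { consumed_ = true; }

  ~ConsumeGuard() noexcept(false) {
    if (consumed_) {
      return;
    }
    if (std::uncaught_exceptions() > 0) {
      // An exception is already propagating; throwing now would call std::terminate().
      return;
    }
    throw std::logic_error("guarded value was never consumed");
  }

 private:
  bool consumed_{false};
};

int main() {
  try {
    ConsumeGuard guard;
    throw std::runtime_error("unrelated failure");  // guard is destroyed during stack unwinding
  } catch (const std::exception&) {
    // Reached only because the destructor suppressed its own throw while unwinding.
  }
}

Note the noexcept(false) on the destructor: destructors are noexcept by default, and a throw escaping a noexcept function would also terminate the process.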
diff --git a/extensions/llamacpp/CMakeLists.txt b/extensions/llamacpp/CMakeLists.txt
index 421143f692..1de6be4589 100644
--- a/extensions/llamacpp/CMakeLists.txt
+++ b/extensions/llamacpp/CMakeLists.txt
@@ -31,7 +31,7 @@ add_minifi_library(minifi-llamacpp SHARED ${SOURCES})
 target_include_directories(minifi-llamacpp PUBLIC "${CMAKE_SOURCE_DIR}/extensions/llamacpp")
 target_include_directories(minifi-llamacpp PUBLIC "${LLAMACPP_INCLUDE_DIRS}")

-target_link_libraries(minifi-llamacpp minifi-cpp-extension-lib llama)
+target_link_libraries(minifi-llamacpp minifi-cpp-extension-lib llama mtmd llama-common)

 register_c_api_extension(minifi-llamacpp "LLAMACPP EXTENSION" LLAMACPP-EXTENSION "Provides llama.cpp support" "extensions/llamacpp/tests")
diff --git a/extensions/llamacpp/processors/DefaultLlamaContext.cpp b/extensions/llamacpp/processors/DefaultLlamaContext.cpp
index 23559fb7c9..d859b139ee 100644
--- a/extensions/llamacpp/processors/DefaultLlamaContext.cpp
+++ b/extensions/llamacpp/processors/DefaultLlamaContext.cpp
@@ -16,8 +16,12 @@
  */

 #include "DefaultLlamaContext.h"
+
+#include
+
 #include "minifi-cpp/Exception.h"
 #include "fmt/format.h"
+#include "mtmd/mtmd-helper.h"

 namespace org::apache::nifi::minifi::extensions::llamacpp::processors {
@@ -36,17 +40,18 @@ std::vector<llama_token> tokenizeInput(const llama_vocab* vocab, const std::stri
   return tokenized_input;
 }

-constexpr size_t DEFAULT_BUFFER_SIZE = 4096;
-
 }  // namespace

-DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path, const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params) {
+DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path, const std::optional<std::filesystem::path>& multimodal_model_path,
+    const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params, const std::shared_ptr<core::logging::Logger>& logger) {
   llama_model_ = llama_model_load_from_file(model_path.string().c_str(), llama_model_default_params());  // NOLINT(cppcoreguidelines-prefer-member-initializer)
   if (!llama_model_) {
     throw Exception(ExceptionType::PROCESS_SCHEDULE_EXCEPTION, fmt::format("Failed to load model from '{}'", model_path.string()));
   }

+  chat_template_ = common_chat_templates_init(llama_model_, "");
+
   llama_context_params ctx_params = llama_context_default_params();
   ctx_params.n_ctx = llama_ctx_params.n_ctx;
   ctx_params.n_batch = llama_ctx_params.n_batch;
@@ -54,7 +59,7 @@ DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path
   ctx_params.n_seq_max = llama_ctx_params.n_seq_max;
   ctx_params.n_threads = llama_ctx_params.n_threads;
   ctx_params.n_threads_batch = llama_ctx_params.n_threads_batch;
-  ctx_params.flash_attn = false;
+  ctx_params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
   llama_ctx_ = llama_init_from_model(llama_model_, ctx_params);

   auto sparams = llama_sampler_chain_default_params();
@@ -73,9 +78,27 @@ DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path
     llama_sampler_chain_add(llama_sampler_, llama_sampler_init_temp(*llama_sampler_params.temperature));
   }
   llama_sampler_chain_add(llama_sampler_, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+
+  if (!multimodal_model_path) {
+    logger->log_info("No multimodal model path provided");
+    return;
+  }
+
+  mtmd_context_params mparams = mtmd_context_params_default();
+  mparams.use_gpu = false;
+  mparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+
+  multimodal_ctx_ = mtmd_init_from_file(multimodal_model_path->string().c_str(), llama_model_, mparams);
+  if (!multimodal_ctx_) {
+    throw Exception(ExceptionType::PROCESS_SCHEDULE_EXCEPTION, fmt::format("Failed to load multimodal model from '{}'", multimodal_model_path->string()));
+  }
+
+  logger->log_info("Successfully loaded multimodal model from '{}'", multimodal_model_path->string());
 }

 DefaultLlamaContext::~DefaultLlamaContext() {
+  mtmd_free(multimodal_ctx_);
+  multimodal_ctx_ = nullptr;
   llama_sampler_free(llama_sampler_);
   llama_sampler_ = nullptr;
   llama_free(llama_ctx_);
@@ -85,47 +108,96 @@ DefaultLlamaContext::~DefaultLlamaContext() {
 }

 std::optional<std::string> DefaultLlamaContext::applyTemplate(const std::vector<LlamaChatMessage>& messages) {
-  std::vector<llama_chat_message> llama_messages;
-  llama_messages.reserve(messages.size());
-  std::transform(messages.begin(), messages.end(), std::back_inserter(llama_messages),
-    [](const LlamaChatMessage& msg) { return llama_chat_message{.role = msg.role.c_str(), .content = msg.content.c_str()}; });
-  std::string text;
-  text.resize(DEFAULT_BUFFER_SIZE);
-  const char * chat_template = llama_model_chat_template(llama_model_, nullptr);
-  int32_t res_size = llama_chat_apply_template(chat_template, llama_messages.data(), llama_messages.size(), true, text.data(), gsl::narrow(text.size()));
-  if (res_size < 0) {
+  if (!chat_template_) {
     return std::nullopt;
   }
-  if (res_size > gsl::narrow(text.size())) {
-    text.resize(res_size);
-    res_size = llama_chat_apply_template(chat_template, llama_messages.data(), llama_messages.size(), true, text.data(), gsl::narrow(text.size()));
-    if (res_size < 0) {
-      return std::nullopt;
-    }
+  common_chat_templates_inputs inputs;
+  for (auto& msg : messages) {
+    common_chat_msg chat_msg;
+    chat_msg.role = msg.role;
+    chat_msg.content = msg.content;
+    inputs.messages.push_back(std::move(chat_msg));
   }
-  text.resize(res_size);
+  inputs.enable_thinking = false;  // TODO(adebreceni): MINIFICPP-2800 common_chat_templates_support_enable_thinking(chat_template_.get());

-  return text;
+  return common_chat_templates_apply(chat_template_.get(), inputs).prompt;
 }

-std::expected<GenerationResult, std::string> DefaultLlamaContext::generate(const std::string& input, std::function<void(std::string_view)> token_handler) {
+namespace {
+
+struct mtmd_bitmap_deleter {
+  void operator()(mtmd_bitmap* val) { mtmd_bitmap_free(val); }
+};
+using unique_bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>;
+
+struct mtmd_input_chunks_deleter {
+  void operator()(mtmd_input_chunks* val) { mtmd_input_chunks_free(val); }
+};
+using unique_mtmd_input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
+
+}  // namespace
+
+std::expected<GenerationResult, std::string> DefaultLlamaContext::generate(const std::string& prompt, const std::vector<std::vector<std::byte>>& files,
+    std::function<void(std::string_view)> token_handler) {
   GenerationResult result{};
   auto start_time = std::chrono::steady_clock::now();
+  llama_memory_seq_rm(llama_get_memory(llama_ctx_), 0, -1, -1);

   const llama_vocab * vocab = llama_model_get_vocab(llama_model_);
-  std::vector<llama_token> tokenized_input = tokenizeInput(vocab, input);
-  result.num_tokens_in = gsl::narrow(tokenized_input.size());
+  llama_pos n_past = 0;
+  std::vector<llama_token> tokenized_input;
+  llama_batch batch = llama_batch_init(1, 0, 1);
+  auto batch_deleter = gsl::finally([&] {llama_batch_free(batch);});
+  batch.n_tokens = 1;
+  batch.n_seq_id[0] = 1;
+  batch.seq_id[0][0] = 0;
+  batch.logits[0] = true;
+  int32_t decode_status = 0;
+  if (multimodal_ctx_) {
+    if (files.empty()) {
+      return std::unexpected{"Multimodal input requires at least one file"};
+    }
+    std::vector<unique_bitmap_ptr> bitmaps;
+    for (auto& file : files) {
+      unique_bitmap_ptr bitmap{mtmd_helper_bitmap_init_from_buf(multimodal_ctx_, reinterpret_cast<const unsigned char*>(file.data()), file.size())};
+      if (!bitmap) {
+        throw Exception(PROCESSOR_EXCEPTION, "Failed to create multimodal bitmap from buffer");
+      }
+      bitmaps.push_back(std::move(bitmap));
+    }
+    mtmd_input_text inp_txt = {
+      .text = prompt.c_str(),
+      .add_special = true,
+      .parse_special = true,
+    };
+    unique_mtmd_input_chunks_ptr chunks{mtmd_input_chunks_init()};
+    auto bitmap_c_ptrs = bitmaps | ranges::views::transform([] (auto& ptr) {return static_cast<const mtmd_bitmap*>(ptr.get());}) | ranges::to<std::vector>();
+    auto tokenized = mtmd_tokenize(multimodal_ctx_, chunks.get(), &inp_txt, bitmap_c_ptrs.data(), bitmap_c_ptrs.size());
+    if (tokenized != 0) {
+      throw Exception(PROCESSOR_EXCEPTION, fmt::format("Failed to tokenize multimodal prompt, error: {}", tokenized));
+    }
+    auto status = mtmd_helper_eval_chunks(multimodal_ctx_, llama_ctx_, chunks.get(), 0, 0, 1, true, &n_past);
+    if (status != 0) {
+      throw Exception(PROCESSOR_EXCEPTION, fmt::format("Failed to eval multimodal chunks, error: {}", status));
+    }
+  } else {
+    if (!files.empty()) {
+      return std::unexpected{"Model is not configured for multimodal input"};
+    }
+    try {
+      tokenized_input = tokenizeInput(vocab, prompt);
+    } catch (std::exception& e) {
+      return std::unexpected{fmt::format("Error during tokenization: {}", e.what())};
+    } catch (...) {
+      return std::unexpected{"Unknown error during tokenization"};
+    }
+    n_past = gsl::narrow(tokenized_input.size());
+    decode_status = llama_decode(llama_ctx_, llama_batch_get_one(tokenized_input.data(), n_past));
+  }
+  result.num_tokens_in = gsl::narrow(n_past);

-  llama_batch batch = llama_batch_get_one(tokenized_input.data(), gsl::narrow(tokenized_input.size()));
   llama_token new_token_id = 0;
   bool first_token_generated = false;
-  while (true) {
-    int32_t res = llama_decode(llama_ctx_, batch);
-    if (res == 1) {
-      return std::unexpected{"Could not find a KV slot for the batch (try reducing the size of the batch or increase the context)"};
-    } else if (res < 0) {
-      return std::unexpected{"Error occurred while executing llama decode"};
-    }
-
+  while (decode_status == 0) {
     new_token_id = llama_sampler_sample(llama_sampler_, llama_ctx_, -1);
     if (!first_token_generated) {
       result.time_to_first_token = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start_time);
@@ -147,8 +219,22 @@ std::expected<GenerationResult, std::string> DefaultLlamaContext::generate(cons
     gsl_Assert(len < 128);
     std::string_view token_str{buf.data(), gsl::narrow(len)};

-    batch = llama_batch_get_one(&new_token_id, 1);
+    batch.token[0] = new_token_id;
+    batch.pos[0] = n_past;
+    ++n_past;
     token_handler(token_str);
+
+    decode_status = llama_decode(llama_ctx_, batch);
+  }
+
+  if (decode_status == 1) {
+    return std::unexpected("Could not find a KV slot for the batch (try reducing the size of the batch or increase the context)");
+  }
+  if (decode_status == 2) {
+    return std::unexpected("Llama decode aborted");
+  }
+  if (decode_status < 0) {
+    return std::unexpected("Error occurred while executing llama decode");
   }

   result.tokens_per_second =
diff --git a/extensions/llamacpp/processors/DefaultLlamaContext.h b/extensions/llamacpp/processors/DefaultLlamaContext.h
index 94c3346b2f..279187b404 100644
--- a/extensions/llamacpp/processors/DefaultLlamaContext.h
+++ b/extensions/llamacpp/processors/DefaultLlamaContext.h
@@ -19,12 +19,16 @@
 #include "LlamaContext.h"
 #include "llama.h"
 #include "LlamaBackendInitializer.h"
+#include "chat.h"
+#include "mtmd/mtmd.h"
+#include "minifi-cpp/core/logging/Logger.h"

 namespace org::apache::nifi::minifi::extensions::llamacpp::processors {

 class DefaultLlamaContext : public LlamaContext {
  public:
-  DefaultLlamaContext(const std::filesystem::path& model_path, const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params);
+  DefaultLlamaContext(const std::filesystem::path& model_path, const std::optional<std::filesystem::path>& multimodal_model_path,
+      const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params, const std::shared_ptr<core::logging::Logger>& logger);
   DefaultLlamaContext(const DefaultLlamaContext&) = delete;
   DefaultLlamaContext(DefaultLlamaContext&&) = delete;
   DefaultLlamaContext& operator=(const DefaultLlamaContext&) = delete;
@@ -32,12 +36,15 @@ class DefaultLlamaContext : public LlamaContext {
   ~DefaultLlamaContext() override;

   std::optional<std::string> applyTemplate(const std::vector<LlamaChatMessage>& messages) override;
-  std::expected<GenerationResult, std::string> generate(const std::string& input, std::function<void(std::string_view)> token_handler) override;
+  std::expected<GenerationResult, std::string> generate(const std::string& prompt, const std::vector<std::vector<std::byte>>& files,
+      std::function<void(std::string_view)> token_handler) override;

  private:
   const LlamaBackendInitializer& llama_context_initializer_ = LlamaBackendInitializer::get();
   llama_model* llama_model_{};
+  common_chat_templates_ptr chat_template_;
   llama_context* llama_ctx_{};
+  mtmd_context* multimodal_ctx_{};
   llama_sampler* llama_sampler_{};
 };
diff --git a/extensions/llamacpp/processors/LlamaContext.h b/extensions/llamacpp/processors/LlamaContext.h
index a7cd2eb44e..557b270268 100644
--- a/extensions/llamacpp/processors/LlamaContext.h
+++ b/extensions/llamacpp/processors/LlamaContext.h
@@ -59,7 +59,8 @@ struct GenerationResult {
 class LlamaContext {
  public:
   virtual std::optional<std::string> applyTemplate(const std::vector<LlamaChatMessage>& messages) = 0;
-  virtual std::expected<GenerationResult, std::string> generate(const std::string& input, std::function<void(std::string_view)> token_handler) = 0;
+  virtual std::expected<GenerationResult, std::string> generate(const std::string& input, const std::vector<std::vector<std::byte>>& files,
+      std::function<void(std::string_view)> token_handler) = 0;
   virtual ~LlamaContext() = default;
 };
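The LlamaContext interface above now takes the prompt and the attached file contents separately and streams tokens through a callback. A rough caller's-eye sketch of the changed generate() signature follows; the runPrompt helper is hypothetical, and the std::vector<std::byte> element type for the attachments is assumed from the processor code later in this diff.

#include <cstddef>
#include <iostream>
#include <string>
#include <string_view>
#include <vector>

#include "extensions/llamacpp/processors/LlamaContext.h"

namespace llamacpp = org::apache::nifi::minifi::extensions::llamacpp::processors;

// Hypothetical helper: streams the generated answer to stdout and returns the full text,
// or an error description if generation fails.
std::string runPrompt(llamacpp::LlamaContext& ctx, const std::string& prompt,
                      const std::vector<std::vector<std::byte>>& attached_files) {
  std::string text;
  auto result = ctx.generate(prompt, attached_files, [&](std::string_view token) {
    text += token;                     // accumulate the full answer
    std::cout << token << std::flush;  // and stream it as it is produced
  });
  if (!result) {
    return "generation failed: " + result.error();
  }
  std::cout << "\n[" << result->num_tokens_in << " tokens in, "
            << result->tokens_per_second << " tokens/s]\n";
  return text;
}

For multimodal models the prompt is expected to contain the mtmd media marker where the attachment belongs; RunLlamaCppInference below inserts mtmd_default_marker() in place of the flow file content when a multimodal model path is configured.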
diff --git a/extensions/llamacpp/processors/RunLlamaCppInference.cpp b/extensions/llamacpp/processors/RunLlamaCppInference.cpp
index 5927a0199e..ede77cd062 100644
--- a/extensions/llamacpp/processors/RunLlamaCppInference.cpp
+++ b/extensions/llamacpp/processors/RunLlamaCppInference.cpp
@@ -31,7 +31,9 @@ namespace org::apache::nifi::minifi::extensions::llamacpp::processors {
 MinifiStatus RunLlamaCppInference::onScheduleImpl(api::core::ProcessContext& context) {
   model_path_.clear();
   model_path_ = api::utils::parseProperty(context, ModelPath);
+  multimodal_model_path_ = api::utils::parseOptionalProperty(context, MultiModalModelPath);
   system_prompt_ = context.getProperty(SystemPrompt).value_or("");
+  output_attribute_ = api::utils::parseOptionalProperty(context, OutputAttributeName);

   LlamaSamplerParams llama_sampler_params;
   llama_sampler_params.temperature = api::utils::parseOptionalFloatProperty(context, Temperature);
@@ -51,9 +53,9 @@ MinifiStatus RunLlamaCppInference::onScheduleImpl(api::core::ProcessContext& con
   llama_ctx_params.n_threads_batch = gsl::narrow(api::utils::parseI64Property(context, ThreadsForBatchProcessing));

   if (llama_context_provider_) {
-    llama_ctx_ = llama_context_provider_(model_path_, llama_sampler_params, llama_ctx_params);
+    llama_ctx_ = llama_context_provider_(model_path_, multimodal_model_path_, llama_sampler_params, llama_ctx_params);
   } else {
-    llama_ctx_ = std::make_unique<DefaultLlamaContext>(model_path_, llama_sampler_params, llama_ctx_params);
+    llama_ctx_ = std::make_unique<DefaultLlamaContext>(model_path_, multimodal_model_path_, llama_sampler_params, llama_ctx_params, logger_);
   }

   return MINIFI_STATUS_SUCCESS;
@@ -76,10 +78,16 @@ MinifiStatus RunLlamaCppInference::onTriggerImpl(api::core::ProcessContext& cont
   auto prompt = context.getProperty(Prompt, &flow_file).value_or("");

   auto read_result = session.readBuffer(flow_file);
+  std::vector<std::vector<std::byte>> files;
   std::string input_data_and_prompt;
   if (!read_result.empty()) {
     input_data_and_prompt.append("Input data (or flow file content):\n");
-    input_data_and_prompt.append({reinterpret_cast<const char*>(read_result.data()), read_result.size()});
+    if (multimodal_model_path_) {
+      input_data_and_prompt.append(mtmd_default_marker());
+      files.push_back(std::move(read_result));
+    } else {
+      input_data_and_prompt.append({reinterpret_cast<const char*>(read_result.data()), read_result.size()});
+    }
     input_data_and_prompt.append("\n\n");
   }
   input_data_and_prompt.append(prompt);
@@ -111,7 +119,7 @@ MinifiStatus RunLlamaCppInference::onTriggerImpl(api::core::ProcessContext& cont
   auto start_time = std::chrono::steady_clock::now();

   std::string text;
-  auto generation_result = llama_ctx_->generate(*input, [&] (std::string_view token) {
+  auto generation_result = llama_ctx_->generate(*input, files, [&] (std::string_view token) {
     text += token;
   });

@@ -133,7 +141,12 @@ MinifiStatus RunLlamaCppInference::onTriggerImpl(api::core::ProcessContext& cont
   session.setAttribute(flow_file, LlamaCppTimeToFirstToken.name, std::to_string(generation_result->time_to_first_token.count()) + " ms");
   session.setAttribute(flow_file, LlamaCppTokensPerSecond.name, fmt::format("{:.2f}", generation_result->tokens_per_second));

-  session.writeBuffer(flow_file, text);
+  if (output_attribute_) {
+    session.setAttribute(flow_file, output_attribute_.value(), text);
+  } else {
+    session.writeBuffer(flow_file, text);
+  }
+
   session.transfer(std::move(flow_file), Success);

   return MINIFI_STATUS_SUCCESS;
diff --git a/extensions/llamacpp/processors/RunLlamaCppInference.h b/extensions/llamacpp/processors/RunLlamaCppInference.h
index 76ae3a1f65..21017db0e9 100644
--- a/extensions/llamacpp/processors/RunLlamaCppInference.h
+++ b/extensions/llamacpp/processors/RunLlamaCppInference.h
@@ -29,7 +29,7 @@ namespace org::apache::nifi::minifi::extensions::llamacpp::processors {

 using LlamaContextProvider =
-    std::function<std::unique_ptr<LlamaContext>(const std::filesystem::path& model_path, const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params)>;
+    std::function<std::unique_ptr<LlamaContext>(const std::filesystem::path& model_path, const std::optional<std::filesystem::path>& multimodal_model_path, const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params)>;

 class RunLlamaCppInferenceMetrics {
  public:
@@ -58,6 +58,15 @@ class RunLlamaCppInference : public api::core::ProcessorImpl {
   EXTENSIONAPI static constexpr auto ModelPath = core::PropertyDefinitionBuilder<>::createProperty("Model Path")
       .withDescription("The filesystem path of the model file in gguf format.")
       .isRequired(true)
+      .withValidator(core::StandardPropertyValidators::NON_BLANK_VALIDATOR)
+      .build();
+  EXTENSIONAPI static constexpr auto OutputAttributeName = core::PropertyDefinitionBuilder<>::createProperty("Output Attribute Name")
+      .withDescription("The attribute to write the generated output to; if not provided, the flow file content is overwritten instead.")
+      .withValidator(core::StandardPropertyValidators::NON_BLANK_VALIDATOR)
+      .build();
+  EXTENSIONAPI static constexpr auto MultiModalModelPath = core::PropertyDefinitionBuilder<>::createProperty("MultiModal Model Path")
+      .withDescription("The filesystem path of the multimodal model (vision, audio) file in gguf format.")
+      .withValidator(core::StandardPropertyValidators::NON_BLANK_VALIDATOR)
       .build();
   EXTENSIONAPI static constexpr auto Temperature = core::PropertyDefinitionBuilder<>::createProperty("Temperature")
       .withDescription("The temperature to use for sampling.")
@@ -128,6 +137,8 @@ class RunLlamaCppInference : public api::core::ProcessorImpl {

   EXTENSIONAPI static constexpr auto Properties = std::to_array({
       ModelPath,
+      OutputAttributeName,
+      MultiModalModelPath,
       Temperature,
       TopK,
       TopP,
@@ -167,7 +178,9 @@ class RunLlamaCppInference : public api::core::ProcessorImpl {
   void increaseTokensOut(uint64_t token_count);

   std::string model_path_;
+  std::optional<std::filesystem::path> multimodal_model_path_;
   std::string system_prompt_;
+  std::optional<std::string> output_attribute_;
   LlamaContextProvider llama_context_provider_;

   std::unique_ptr<LlamaContext> llama_ctx_;
diff --git a/extensions/llamacpp/tests/RunLlamaCppInferenceTests.cpp b/extensions/llamacpp/tests/RunLlamaCppInferenceTests.cpp
index 9fa893fba3..2b97976fa3 100644
--- a/extensions/llamacpp/tests/RunLlamaCppInferenceTests.cpp
+++ b/extensions/llamacpp/tests/RunLlamaCppInferenceTests.cpp
@@ -37,10 +37,16 @@ class MockLlamaContext : public processors::LlamaContext {
     return "Test input";
   }

-  std::expected<processors::GenerationResult, std::string> generate(const std::string& input, std::function<void(std::string_view)> token_handler) override {
+  std::expected<processors::GenerationResult, std::string> generate(const std::string& input, const std::vector<std::vector<std::byte>>& files,
+      std::function<void(std::string_view)> token_handler) override {
     if (fail_generation_) {
       return std::unexpected{"Generation failed"};
     }
+    if (multimodal_) {
+      if (files.empty()) {
+        return std::unexpected{"Files empty"};
+      }
+    }
     processors::GenerationResult result;
     input_ = input;
     token_handler("Test ");
@@ -69,7 +75,12 @@ class MockLlamaContext : public processors::LlamaContext {
     fail_apply_template_ = true;
   }

+  void setMultimodal() {
+    multimodal_ = true;
+  }
+
  private:
+  bool multimodal_{false};
   bool fail_generation_{false};
   bool fail_apply_template_{false};
   std::vector<processors::LlamaChatMessage> messages_;
@@ -84,7 +95,7 @@ TEST_CASE("Prompt is generated correctly with default parameters") {
   processors::LlamaContextParams test_context_params;
   minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor(
     core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()},
-    [&](const std::filesystem::path& model_path, const processors::LlamaSamplerParams& sampler_params, const processors::LlamaContextParams& context_params) {
+    [&](const std::filesystem::path& model_path, const std::optional<std::filesystem::path>&, const processors::LlamaSamplerParams& sampler_params, const processors::LlamaContextParams& context_params) {
       test_model_path = model_path;
       test_sampler_params = sampler_params;
       test_context_params = context_params;
@@ -130,7 +141,7 @@ TEST_CASE("Prompt is generated correctly with custom parameters") {
   processors::LlamaContextParams test_context_params;
   minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor(
     core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()},
-    [&](const std::filesystem::path& model_path, const processors::LlamaSamplerParams& sampler_params, const processors::LlamaContextParams& context_params) {
+    [&](const std::filesystem::path& model_path, const std::optional<std::filesystem::path>&, const processors::LlamaSamplerParams& sampler_params, const processors::LlamaContextParams& context_params) {
       test_model_path = model_path;
       test_sampler_params = sampler_params;
       test_context_params = context_params;
@@ -182,7 +193,7 @@ TEST_CASE("Empty flow file does not include input data in prompt") {
   auto mock_llama_context_ptr = mock_llama_context.get();
   minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor(
     core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()},
-    [&](const std::filesystem::path&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
+    [&](const std::filesystem::path&, const std::optional<std::filesystem::path>&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
       return std::move(mock_llama_context);
     }));
   LogTestController::getInstance().setTrace();
@@ -206,7 +217,7 @@ TEST_CASE("Empty flow file does not include input data in prompt") {
 TEST_CASE("Invalid values for optional double type properties throw exception") {
   minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor(
     core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()},
-    [&](const std::filesystem::path&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
+    [&](const std::filesystem::path&, const std::optional<std::filesystem::path>&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
       return std::make_unique<MockLlamaContext>();
     }));
   LogTestController::getInstance().setTrace();
@@ -236,7 +247,7 @@ TEST_CASE("Top K property empty and invalid values are handled properly") {
   std::optional test_top_k = 0;
   minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor(
     core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()},
-    [&](const std::filesystem::path&, const processors::LlamaSamplerParams& sampler_params, const processors::LlamaContextParams&) {
+    [&](const std::filesystem::path&, const std::optional<std::filesystem::path>&, const processors::LlamaSamplerParams& sampler_params, const processors::LlamaContextParams&) {
       test_top_k = sampler_params.top_k;
       return std::make_unique<MockLlamaContext>();
     }));
@@ -269,7 +280,7 @@ TEST_CASE("Error handling during generation and applying template") {

   minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor(
     core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()},
-    [&](const std::filesystem::path&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
+    [&](const std::filesystem::path&, const std::optional<std::filesystem::path>&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
       return std::move(mock_llama_context);
     }));
   LogTestController::getInstance().setTrace();
@@ -287,7 +298,7 @@ TEST_CASE("Error handling during generation and applying template") {
 TEST_CASE("Route flow file to failure when prompt and input data is empty") {
   minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor(
     core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()},
-    [&](const std::filesystem::path&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
+    [&](const std::filesystem::path&, const std::optional<std::filesystem::path>&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
       return std::make_unique<MockLlamaContext>();
     }));
   LogTestController::getInstance().setTrace();
@@ -307,7 +318,7 @@ TEST_CASE("System prompt is optional") {
   auto mock_llama_context_ptr = mock_llama_context.get();
   minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor(
     core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()},
-    [&](const std::filesystem::path&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
+    [&](const std::filesystem::path&, const std::optional<std::filesystem::path>&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
       return std::move(mock_llama_context);
     }));
   LogTestController::getInstance().setTrace();
@@ -329,7 +340,7 @@ TEST_CASE("Test output metrics") {
   auto processor = minifi::test::utils::make_custom_c_processor(
     core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()},
-    [&](const std::filesystem::path&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
+    [&](const std::filesystem::path&, const std::optional<std::filesystem::path>&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
       return std::make_unique<MockLlamaContext>();
     });
   auto processor_metrics = processor->getMetrics();
@@ -357,4 +368,63 @@ TEST_CASE("Test output metrics") {
   CHECK(c2_metrics[0].children[c2_metrics[0].children.size() - 1].value.to_string() == "6");
 }

inference") { + auto mock_llama_context = std::make_unique(); + auto mock_llama_context_ptr = mock_llama_context.get(); + std::filesystem::path test_model_path; + std::optional test_model_path_option; + minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor( + core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()}, + [&](const std::filesystem::path& model_path, const std::optional& multimodal_model_path, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) { + test_model_path = model_path; + test_model_path_option = multimodal_model_path; + if (multimodal_model_path) { + mock_llama_context->setMultimodal(); + } + return std::move(mock_llama_context); + })); + LogTestController::getInstance().setTrace(); + REQUIRE(controller.getProcessor()->setProperty(processors::RunLlamaCppInference::ModelPath.name, "/path/to/model")); + REQUIRE(controller.getProcessor()->setProperty(processors::RunLlamaCppInference::MultiModalModelPath.name, "/path/to/mm-model")); + REQUIRE(controller.getProcessor()->setProperty(processors::RunLlamaCppInference::Prompt.name, "What is on the image?")); + + + SECTION("Flowfile contains data") { + auto results = controller.trigger(minifi::test::InputFlowFileData{.content = "", .attributes = {}}); + CHECK(test_model_path == "/path/to/model"); + CHECK(test_model_path_option == "/path/to/mm-model"); + REQUIRE(results.at(processors::RunLlamaCppInference::Success).size() == 1); + auto& output_flow_file = results.at(processors::RunLlamaCppInference::Success)[0]; + CHECK(controller.plan->getContent(output_flow_file) == "Test generated content"); + CHECK(mock_llama_context_ptr->getInput() == "Test input"); + REQUIRE(mock_llama_context_ptr->getMessages().size() == 2); + CHECK(mock_llama_context_ptr->getMessages()[1].role == "user"); + CHECK(mock_llama_context_ptr->getMessages()[1].content == "Input data (or flow file content):\n<__media__>\n\nWhat is on the image?"); + } + + SECTION("Flowfile is empty") { + auto results = controller.trigger(minifi::test::InputFlowFileData{.content = "", .attributes = {}}); + REQUIRE(results.at(processors::RunLlamaCppInference::Failure).size() == 1); + } +} + +TEST_CASE("Can write content to attribute") { + minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor( + core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()}, + [&](const std::filesystem::path&, const std::optional&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) { + return std::make_unique(); + })); + LogTestController::getInstance().setTrace(); + REQUIRE(controller.getProcessor()->setProperty(processors::RunLlamaCppInference::ModelPath.name, "/path/to/model")); + REQUIRE(controller.getProcessor()->setProperty(processors::RunLlamaCppInference::Prompt.name, "What is love?")); + REQUIRE(controller.getProcessor()->setProperty(processors::RunLlamaCppInference::OutputAttributeName.name, "DontHurtMe")); + + + auto results = controller.trigger(minifi::test::InputFlowFileData{.content = "Some content", .attributes = {}}); + REQUIRE(results.at(processors::RunLlamaCppInference::Success).size() == 1); + auto& output_flow_file = results.at(processors::RunLlamaCppInference::Success)[0]; + CHECK(controller.plan->getContent(output_flow_file) == "Some content"); + CHECK(output_flow_file->getAttribute("DontHurtMe") == "Test generated content"); +} + } // namespace 
 }  // namespace org::apache::nifi::minifi::extensions::llamacpp::test
diff --git a/extensions/llamacpp/tests/features/environment.py b/extensions/llamacpp/tests/features/environment.py
index 9f23b76564..bcef201bf4 100644
--- a/extensions/llamacpp/tests/features/environment.py
+++ b/extensions/llamacpp/tests/features/environment.py
@@ -14,6 +14,7 @@
 # limitations under the License.

 from textwrap import dedent
+from pathlib import Path
 from minifi_behave.containers.docker_image_builder import DockerImageBuilder
 from minifi_behave.core.hooks import common_before_scenario
 from minifi_behave.core.hooks import common_after_scenario
@@ -29,7 +30,9 @@ def before_all(context: MinifiTestContext):

     dockerfile = dedent("""\
         FROM {base_image}
-        RUN mkdir {models_path} && wget https://huggingface.co/bartowski/Qwen2-0.5B-Instruct-GGUF/resolve/main/Qwen2-0.5B-Instruct-IQ3_M.gguf --directory-prefix={models_path}
+        RUN mkdir {models_path}
+        RUN wget https://huggingface.co/bartowski/Qwen2-VL-2B-Instruct-GGUF/resolve/main/Qwen2-VL-2B-Instruct-Q3_K_M.gguf --directory-prefix={models_path}
+        RUN wget https://huggingface.co/bartowski/Qwen2-VL-2B-Instruct-GGUF/resolve/main/mmproj-Qwen2-VL-2B-Instruct-f16.gguf --directory-prefix={models_path}
         """.format(base_image=minifi_container_image, models_path='/tmp/models'))

     builder = DockerImageBuilder(
@@ -42,6 +45,7 @@ def before_all(context: MinifiTestContext):
 def before_scenario(context: MinifiTestContext, scenario):
     context.minifi_container_image = "apacheminificpp:llama"
     common_before_scenario(context, scenario)
+    context.resource_dir = Path(__file__).resolve().parent / "resources"


 def after_scenario(context, scenario):
diff --git a/extensions/llamacpp/tests/features/llamacpp.feature b/extensions/llamacpp/tests/features/llamacpp.feature
index 437461c08f..b905ab4e69 100644
--- a/extensions/llamacpp/tests/features/llamacpp.feature
+++ b/extensions/llamacpp/tests/features/llamacpp.feature
@@ -18,11 +18,27 @@ Feature: Run language model inference using LlamaCpp processor

   Scenario: Test inference with a small model
     Given a GenerateFlowFile processor with the "File Size" property set to "0B"
-    And a RunLlamaCppInference processor with the "Model Path" property set to "/tmp/models/Qwen2-0.5B-Instruct-IQ3_M.gguf"
+    And a RunLlamaCppInference processor with the "Model Path" property set to "/tmp/models/Qwen2-VL-2B-Instruct-Q3_K_M.gguf"
     And the "Prompt" property of the RunLlamaCppInference processor is set to "Repeat after me: banana banana banana"
+    And the "Temperature" property of the RunLlamaCppInference processor is set to "0"
     And a LogAttribute processor with the "Log Payload" property set to "true"
     And the "success" relationship of the GenerateFlowFile processor is connected to the RunLlamaCppInference
     And the "success" relationship of the RunLlamaCppInference processor is connected to the LogAttribute

     When all instances start up
     Then the Minifi logs contain the following message: "banana" in less than 60 seconds
+
+  Scenario: Test multimodal inference with a small model
+    Given a GetFile processor with the "Input Directory" property set to "/tmp/input"
+    And a directory at "/tmp/input" has a file with the content from "test-image.png"
+    And a RunLlamaCppInference processor with the "Model Path" property set to "/tmp/models/Qwen2-VL-2B-Instruct-Q3_K_M.gguf"
+    And the "Prompt" property of the RunLlamaCppInference processor is set to "Output only what is written on the image."
+ And the "MultiModal Model Path" property of the RunLlamaCppInference processor is set to "/tmp/models/mmproj-Qwen2-VL-2B-Instruct-f16.gguf" + And the "Temperature" property of the RunLlamaCppInference processor is set to "0" + And a PutFile processor with the "Directory" property set to "/tmp/output" + And the "success" relationship of the GetFile processor is connected to the RunLlamaCppInference + And the "success" relationship of the RunLlamaCppInference processor is connected to the PutFile + + When all instances start up + Then a single file with the content "minifi" is placed in the "/tmp/output" directory in less than 60 seconds + diff --git a/extensions/llamacpp/tests/features/resources/test-image.png b/extensions/llamacpp/tests/features/resources/test-image.png new file mode 100644 index 0000000000..f6d7720d5a Binary files /dev/null and b/extensions/llamacpp/tests/features/resources/test-image.png differ diff --git a/thirdparty/llamacpp/cpp-23-fixes.patch b/thirdparty/llamacpp/cpp-23-fixes.patch deleted file mode 100644 index 0e84e43956..0000000000 --- a/thirdparty/llamacpp/cpp-23-fixes.patch +++ /dev/null @@ -1,24 +0,0 @@ -From 072bd8ce7e10a0fffb1e2bc755c2964e472909ed Mon Sep 17 00:00:00 2001 -From: Martin Zink -Date: Tue, 22 Jul 2025 12:49:42 +0200 -Subject: [PATCH] c++23 fixes - ---- - src/llama-hparams.cpp | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp -index c6c67d26..db36de4d 100644 ---- a/src/llama-hparams.cpp -+++ b/src/llama-hparams.cpp -@@ -1,5 +1,7 @@ - #include "llama-hparams.h" - -+#include -+ - #include "ggml.h" - - void llama_hparams::set_swa_pattern(uint32_t n_pattern) { --- -2.39.5 (Apple Git-154) - diff --git a/thirdparty/llamacpp/lu8_macro_fix.patch b/thirdparty/llamacpp/lu8_macro_fix.patch deleted file mode 100644 index a1b92d28b3..0000000000 --- a/thirdparty/llamacpp/lu8_macro_fix.patch +++ /dev/null @@ -1,17 +0,0 @@ -diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp -index dd27a381..47550954 100644 ---- a/src/llama-chat.cpp -+++ b/src/llama-chat.cpp -@@ -6,11 +6,7 @@ - #include - #include - --#if __cplusplus >= 202000L -- #define LU8(x) (const char*)(u8##x) --#else -- #define LU8(x) u8##x --#endif -+#define LU8(x) reinterpret_cast(u8##x) - - // trim whitespace from the beginning and end of a string - static std::string trim(const std::string & str) { diff --git a/thirdparty/llamacpp/mtmd-fix.patch b/thirdparty/llamacpp/mtmd-fix.patch new file mode 100644 index 0000000000..6ac631d4e5 --- /dev/null +++ b/thirdparty/llamacpp/mtmd-fix.patch @@ -0,0 +1,44 @@ +diff --color=auto -rupN llama.cpp-b8944/CMakeLists.txt llama.cpp-b8944-patched/CMakeLists.txt +--- llama.cpp-b8944/CMakeLists.txt 2026-04-27 08:30:55 ++++ llama.cpp-b8944-patched/CMakeLists.txt 2026-04-27 13:49:25 +@@ -191,6 +191,7 @@ add_subdirectory(src) + # + + add_subdirectory(src) ++add_subdirectory(tools/mtmd) + + # + # utils, programs, examples and tests +diff --color=auto -rupN llama.cpp-b8944/common/ngram-mod.cpp llama.cpp-b8944-patched/common/ngram-mod.cpp +--- llama.cpp-b8944/common/ngram-mod.cpp 2026-04-27 08:30:55 ++++ llama.cpp-b8944-patched/common/ngram-mod.cpp 2026-04-30 08:28:08 +@@ -1,4 +1,5 @@ + #include "ngram-mod.h" ++#include + + // + // common_ngram_mod +diff --color=auto -rupN llama.cpp-b8944/tools/mtmd/CMakeLists.txt llama.cpp-b8944-patched/tools/mtmd/CMakeLists.txt +--- llama.cpp-b8944/tools/mtmd/CMakeLists.txt 2026-04-27 08:30:55 ++++ llama.cpp-b8944-patched/tools/mtmd/CMakeLists.txt 2026-04-27 13:50:45 +@@ -101,20 +101,6 
+@@ -101,20 +101,6 @@ endif()
+     endif()
+ endif()
+
+-add_executable(llama-llava-cli deprecation-warning.cpp)
+-add_executable(llama-gemma3-cli deprecation-warning.cpp)
+-add_executable(llama-minicpmv-cli deprecation-warning.cpp)
+-add_executable(llama-qwen2vl-cli deprecation-warning.cpp)
+-
+-set(TARGET llama-mtmd-cli)
+-add_executable (${TARGET} mtmd-cli.cpp)
+-set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
+-if(LLAMA_TOOLS_INSTALL)
+-    install(TARGETS ${TARGET} RUNTIME)
+-endif()
+-target_link_libraries (${TARGET} PRIVATE llama-common mtmd Threads::Threads)
+-target_compile_features(${TARGET} PRIVATE cxx_std_17)
+-
+ # mtmd-debug tool
+ add_executable(llama-mtmd-debug debug/mtmd-debug.cpp)
+ set_target_properties(llama-mtmd-debug PROPERTIES OUTPUT_NAME llama-mtmd-debug)