diff --git a/behave_framework/src/minifi_behave/steps/core_steps.py b/behave_framework/src/minifi_behave/steps/core_steps.py
index f2efef59b5..3613467f24 100644
--- a/behave_framework/src/minifi_behave/steps/core_steps.py
+++ b/behave_framework/src/minifi_behave/steps/core_steps.py
@@ -25,6 +25,7 @@
 import humanfriendly
 from behave import when, step, given
+from pathlib import Path
 from minifi_behave.containers.http_proxy_container import HttpProxy
 from minifi_behave.containers.nifi_container import NifiContainer
 from minifi_behave.containers.directory import Directory
@@ -82,6 +83,16 @@ def create_file_with_content_in_directory(context: MinifiTestContext, directory:
     context.execute_steps(f'given a directory at "{directory}" has a file with the content "{content}" in the "{DEFAULT_MINIFI_CONTAINER_NAME}" flow')


+@step('a directory at "{directory}" has a file with the content from "{path}"')
+@step("a directory at '{directory}' has a file with the content from '{path}'")
+def create_file_with_content_from_file_in_directory(context: MinifiTestContext, directory: str, path: str):
+    assert context.resource_dir is not None, "Cannot copy file if resource_dir is not set for the context"
+    with open(context.resource_dir / path, "rb") as f:
+        content = f.read()
+    context.execute_steps(f'given a directory at "{directory}" has a file with the content "{content}" in the "{DEFAULT_MINIFI_CONTAINER_NAME}" flow')
+
+
 @step('a directory at "{directory}" has a file "{file_name}" with the content "{content}"')
 def create_file_with_name_and_content_in_directory(context: MinifiTestContext, directory: str, file_name: str, content: str):
     __add_directory_with_file_to_container(context, directory, file_name, content, DEFAULT_MINIFI_CONTAINER_NAME)
diff --git a/cmake/LlamaCpp.cmake b/cmake/LlamaCpp.cmake
index f78101c6f9..d9e764379c 100644
--- a/cmake/LlamaCpp.cmake
+++ b/cmake/LlamaCpp.cmake
@@ -21,6 +21,7 @@ set(BUILD_SHARED_LIBS "OFF" CACHE STRING "" FORCE)
 set(LLAMA_BUILD_TESTS "OFF" CACHE STRING "" FORCE)
 set(LLAMA_BUILD_EXAMPLES "OFF" CACHE STRING "" FORCE)
 set(LLAMA_BUILD_SERVER "OFF" CACHE STRING "" FORCE)
+set(LLAMA_BUILD_COMMON "ON" CACHE STRING "" FORCE)
 set(GGML_OPENMP "OFF" CACHE STRING "" FORCE)
 set(GGML_METAL "OFF" CACHE STRING "" FORCE)
 set(GGML_BLAS "OFF" CACHE STRING "" FORCE)
@@ -30,24 +31,30 @@ else()
     set(GGML_NATIVE "ON" CACHE STRING "" FORCE)
 endif()

-set(PATCH_FILE_1 "${CMAKE_SOURCE_DIR}/thirdparty/llamacpp/lu8_macro_fix.patch")  # https://github.com/ggml-org/llama.cpp/issues/12740
-set(PATCH_FILE_2 "${CMAKE_SOURCE_DIR}/thirdparty/llamacpp/cpp-23-fixes.patch")
+set(PATCH_FILE_1 "${CMAKE_SOURCE_DIR}/thirdparty/llamacpp/mtmd-fix.patch")
 set(PC ${Bash_EXECUTABLE} -c "set -x &&\
-        (\\\"${Patch_EXECUTABLE}\\\" -p1 -R -s -f --dry-run -i \\\"${PATCH_FILE_1}\\\" || \\\"${Patch_EXECUTABLE}\\\" -p1 -N -i \\\"${PATCH_FILE_1}\\\") &&\
-        (\\\"${Patch_EXECUTABLE}\\\" -p1 -R -s -f --dry-run -i \\\"${PATCH_FILE_2}\\\" || \\\"${Patch_EXECUTABLE}\\\" -p1 -N -i \\\"${PATCH_FILE_2}\\\")")
+        (\\\"${Patch_EXECUTABLE}\\\" -p1 -R -s -f --dry-run -i \\\"${PATCH_FILE_1}\\\" || \\\"${Patch_EXECUTABLE}\\\" -p1 -N -i \\\"${PATCH_FILE_1}\\\")")
+
 FetchContent_Declare(llamacpp
-    URL      https://github.com/ggml-org/llama.cpp/archive/refs/tags/b5958.tar.gz
-    URL_HASH SHA256=4e8a2abd83092aa446cd13556f6fe8777139da7b191bdaa0e1b79fe9740b36a6
-    PATCH_COMMAND "${PC}"
-    SYSTEM
+    URL      https://github.com/ggml-org/llama.cpp/archive/refs/tags/b8944.tar.gz
+    URL_HASH SHA256=ca231c8aca086f56bad3ed371f6dc5b01e971e812a8ddf67564f087390c0e781
+    PATCH_COMMAND "${PC}"
+    SYSTEM
 )
 FetchContent_MakeAvailable(llamacpp)

+if(MSVC AND TARGET llama)
+    target_compile_options(llama PRIVATE /Zc:__cplusplus)
+endif()
+
 set(LLAMACPP_INCLUDE_DIRS
         "${llamacpp_SOURCE_DIR}/include"
         "${llamacpp_SOURCE_DIR}/ggml/include"
+        "${llamacpp_SOURCE_DIR}/tools"
+        "${llamacpp_SOURCE_DIR}/common"
+        "${llamacpp_SOURCE_DIR}/vendor"
     CACHE STRING "" FORCE
 )
diff --git a/extension-framework/cpp-extension-lib/include/api/core/FlowFile.h b/extension-framework/cpp-extension-lib/include/api/core/FlowFile.h
index 833dfdaf73..4a288f36d7 100644
--- a/extension-framework/cpp-extension-lib/include/api/core/FlowFile.h
+++ b/extension-framework/cpp-extension-lib/include/api/core/FlowFile.h
@@ -26,7 +26,11 @@ namespace org::apache::nifi::minifi::api::core {
 struct EnsureMovedFromDeleter {
   void operator()(MinifiFlowFile* ff) {
     if (ff) {
-      throw std::logic_error("Each flowfile should be either transferred or removed");
+      if (std::uncaught_exceptions()) {
+        // there is already an exception in progress, do not terminate the process (although there are scenarios we could throw here)
+      } else {
+        throw std::logic_error("Each flowfile should be either transferred or removed");
+      }
     }
   }
 };
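For context on the FlowFile.h change above: a deleter that throws while another exception is already propagating ends the process via std::terminate, so the deleter now only throws when no exception is in flight. Below is a minimal, self-contained sketch of the same guard pattern; the ConsumeGuard type is hypothetical and only illustrates the std::uncaught_exceptions() check, it is not part of this patch.

#include <exception>
#include <stdexcept>

class ConsumeGuard {
 public:
  void consume() { consumed_ = true; }

  ~ConsumeGuard() noexcept(false) {
    if (consumed_) {
      return;
    }
    if (std::uncaught_exceptions() > 0) {
      // An exception is already propagating; throwing now would call std::terminate().
      return;
    }
    throw std::logic_error("guarded value was never consumed");
  }

 private:
  bool consumed_{false};
};

int main() {
  try {
    ConsumeGuard guard;
    throw std::runtime_error("unrelated failure");  // guard is destroyed during stack unwinding
  } catch (const std::exception&) {
    // Reached only because the destructor suppressed its own throw while unwinding.
  }
}

Note the noexcept(false) on the destructor: destructors are noexcept by default, and a throw escaping a noexcept function would also terminate the process.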
diff --git a/extensions/llamacpp/CMakeLists.txt b/extensions/llamacpp/CMakeLists.txt
index 421143f692..1de6be4589 100644
--- a/extensions/llamacpp/CMakeLists.txt
+++ b/extensions/llamacpp/CMakeLists.txt
@@ -31,7 +31,7 @@ add_minifi_library(minifi-llamacpp SHARED ${SOURCES})
 target_include_directories(minifi-llamacpp PUBLIC "${CMAKE_SOURCE_DIR}/extensions/llamacpp")
 target_include_directories(minifi-llamacpp PUBLIC "${LLAMACPP_INCLUDE_DIRS}")

-target_link_libraries(minifi-llamacpp minifi-cpp-extension-lib llama)
+target_link_libraries(minifi-llamacpp minifi-cpp-extension-lib llama mtmd llama-common)

 register_c_api_extension(minifi-llamacpp "LLAMACPP EXTENSION" LLAMACPP-EXTENSION "Provides llama.cpp support" "extensions/llamacpp/tests")
diff --git a/extensions/llamacpp/processors/DefaultLlamaContext.cpp b/extensions/llamacpp/processors/DefaultLlamaContext.cpp
index 23559fb7c9..d859b139ee 100644
--- a/extensions/llamacpp/processors/DefaultLlamaContext.cpp
+++ b/extensions/llamacpp/processors/DefaultLlamaContext.cpp
@@ -16,8 +16,12 @@
  */

 #include "DefaultLlamaContext.h"
+
+#include
+
 #include "minifi-cpp/Exception.h"
 #include "fmt/format.h"
+#include "mtmd/mtmd-helper.h"

 namespace org::apache::nifi::minifi::extensions::llamacpp::processors {
@@ -36,17 +40,18 @@ std::vector<llama_token> tokenizeInput(const llama_vocab* vocab, const std::stri
   return tokenized_input;
 }

-constexpr size_t DEFAULT_BUFFER_SIZE = 4096;
-
 }  // namespace

-DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path, const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params) {
+DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path, const std::optional<std::filesystem::path>& multimodal_model_path,
+    const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params, const std::shared_ptr<core::logging::Logger>& logger) {
   llama_model_ = llama_model_load_from_file(model_path.string().c_str(), llama_model_default_params());  // NOLINT(cppcoreguidelines-prefer-member-initializer)
   if (!llama_model_) {
     throw Exception(ExceptionType::PROCESS_SCHEDULE_EXCEPTION, fmt::format("Failed to load model from '{}'", model_path.string()));
   }

+  chat_template_ = common_chat_templates_init(llama_model_, "");
+
   llama_context_params ctx_params = llama_context_default_params();
   ctx_params.n_ctx = llama_ctx_params.n_ctx;
   ctx_params.n_batch = llama_ctx_params.n_batch;
@@ -54,7 +59,7 @@ DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path
   ctx_params.n_seq_max = llama_ctx_params.n_seq_max;
   ctx_params.n_threads = llama_ctx_params.n_threads;
   ctx_params.n_threads_batch = llama_ctx_params.n_threads_batch;
-  ctx_params.flash_attn = false;
+  ctx_params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
   llama_ctx_ = llama_init_from_model(llama_model_, ctx_params);

   auto sparams = llama_sampler_chain_default_params();
@@ -73,9 +78,27 @@ DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path
     llama_sampler_chain_add(llama_sampler_, llama_sampler_init_temp(*llama_sampler_params.temperature));
   }
   llama_sampler_chain_add(llama_sampler_, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+
+  if (!multimodal_model_path) {
+    logger->log_info("No multimodal model path provided");
+    return;
+  }
+
+  mtmd_context_params mparams = mtmd_context_params_default();
+  mparams.use_gpu = false;
+  mparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+
+  multimodal_ctx_ = mtmd_init_from_file(multimodal_model_path->string().c_str(), llama_model_, mparams);
+  if (!multimodal_ctx_) {
+    throw Exception(ExceptionType::PROCESS_SCHEDULE_EXCEPTION, fmt::format("Failed to load multimodal model from '{}'", multimodal_model_path->string()));
+  }
+
+  logger->log_info("Successfully loaded multimodal model from '{}'", multimodal_model_path->string());
 }

 DefaultLlamaContext::~DefaultLlamaContext() {
+  mtmd_free(multimodal_ctx_);
+  multimodal_ctx_ = nullptr;
   llama_sampler_free(llama_sampler_);
   llama_sampler_ = nullptr;
   llama_free(llama_ctx_);
@@ -85,47 +108,96 @@ DefaultLlamaContext::~DefaultLlamaContext() {
 }

 std::optional<std::string> DefaultLlamaContext::applyTemplate(const std::vector<LlamaChatMessage>& messages) {
-  std::vector<llama_chat_message> llama_messages;
-  llama_messages.reserve(messages.size());
-  std::transform(messages.begin(), messages.end(), std::back_inserter(llama_messages),
-    [](const LlamaChatMessage& msg) { return llama_chat_message{.role = msg.role.c_str(), .content = msg.content.c_str()}; });
-  std::string text;
-  text.resize(DEFAULT_BUFFER_SIZE);
-  const char * chat_template = llama_model_chat_template(llama_model_, nullptr);
-  int32_t res_size = llama_chat_apply_template(chat_template, llama_messages.data(), llama_messages.size(), true, text.data(), gsl::narrow(text.size()));
-  if (res_size < 0) {
+  if (!chat_template_) {
     return std::nullopt;
   }
-  if (res_size > gsl::narrow(text.size())) {
-    text.resize(res_size);
-    res_size = llama_chat_apply_template(chat_template, llama_messages.data(), llama_messages.size(), true, text.data(), gsl::narrow(text.size()));
-    if (res_size < 0) {
-      return std::nullopt;
-    }
+  common_chat_templates_inputs inputs;
+  for (auto& msg : messages) {
+    common_chat_msg chat_msg;
+    chat_msg.role = msg.role;
+    chat_msg.content = msg.content;
+    inputs.messages.push_back(std::move(chat_msg));
   }
-  text.resize(res_size);
+  inputs.enable_thinking = false;  // TODO(adebreceni): MINIFICPP-2800 common_chat_templates_support_enable_thinking(chat_template_.get());

-  return text;
+  return common_chat_templates_apply(chat_template_.get(), inputs).prompt;
 }

-std::expected<GenerationResult, std::string> DefaultLlamaContext::generate(const std::string& input, std::function<void(std::string_view)> token_handler) {
+namespace {
+
+struct mtmd_bitmap_deleter {
+  void operator()(mtmd_bitmap* val) { mtmd_bitmap_free(val); }
+};
+using unique_bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>;
+
+struct mtmd_input_chunks_deleter {
+  void operator()(mtmd_input_chunks* val) { mtmd_input_chunks_free(val); }
+};
+using unique_mtmd_input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
+
+}  // namespace
+
+std::expected<GenerationResult, std::string> DefaultLlamaContext::generate(const std::string& prompt, const std::vector<std::vector<std::byte>>& files,
+    std::function<void(std::string_view)> token_handler) {
   GenerationResult result{};
   auto start_time = std::chrono::steady_clock::now();
+  llama_memory_seq_rm(llama_get_memory(llama_ctx_), 0, -1, -1);

   const llama_vocab * vocab = llama_model_get_vocab(llama_model_);
-  std::vector<llama_token> tokenized_input = tokenizeInput(vocab, input);
-  result.num_tokens_in = gsl::narrow(tokenized_input.size());
+  llama_pos n_past = 0;
+  std::vector<llama_token> tokenized_input;
+  llama_batch batch = llama_batch_init(1, 0, 1);
+  auto batch_deleter = gsl::finally([&] {llama_batch_free(batch);});
+  batch.n_tokens = 1;
+  batch.n_seq_id[0] = 1;
+  batch.seq_id[0][0] = 0;
+  batch.logits[0] = true;
+  int32_t decode_status = 0;
+  if (multimodal_ctx_) {
+    if (files.empty()) {
+      return std::unexpected{"Multimodal input requires at least one file"};
+    }
+    std::vector<unique_bitmap_ptr> bitmaps;
+    for (auto& file : files) {
+      unique_bitmap_ptr bitmap{mtmd_helper_bitmap_init_from_buf(multimodal_ctx_, reinterpret_cast<const unsigned char*>(file.data()), file.size())};
+      if (!bitmap) {
+        throw Exception(PROCESSOR_EXCEPTION, "Failed to create multimodal bitmap from buffer");
+      }
+      bitmaps.push_back(std::move(bitmap));
+    }
+    mtmd_input_text inp_txt = {
+      .text = prompt.c_str(),
+      .add_special = true,
+      .parse_special = true,
+    };
+    unique_mtmd_input_chunks_ptr chunks{mtmd_input_chunks_init()};
+    auto bitmap_c_ptrs = bitmaps | ranges::views::transform([] (auto& ptr) {return static_cast<const mtmd_bitmap*>(ptr.get());}) | ranges::to<std::vector>();
+    auto tokenized = mtmd_tokenize(multimodal_ctx_, chunks.get(), &inp_txt, bitmap_c_ptrs.data(), bitmap_c_ptrs.size());
+    if (tokenized != 0) {
+      throw Exception(PROCESSOR_EXCEPTION, fmt::format("Failed to tokenize multimodal prompt, error: {}", tokenized));
+    }
+    auto status = mtmd_helper_eval_chunks(multimodal_ctx_, llama_ctx_, chunks.get(), 0, 0, 1, true, &n_past);
+    if (status != 0) {
+      throw Exception(PROCESSOR_EXCEPTION, fmt::format("Failed to eval multimodal chunks, error: {}", status));
+    }
+  } else {
+    if (!files.empty()) {
+      return std::unexpected{"Model is not configured for multimodal input"};
+    }
+    try {
+      tokenized_input = tokenizeInput(vocab, prompt);
+    } catch (std::exception& e) {
+      return std::unexpected{fmt::format("Error during tokenization: {}", e.what())};
+    } catch (...) {
+      return std::unexpected{"Unknown error during tokenization"};
+    }
+    n_past = gsl::narrow(tokenized_input.size());
+    decode_status = llama_decode(llama_ctx_, llama_batch_get_one(tokenized_input.data(), n_past));
+  }
+  result.num_tokens_in = gsl::narrow(n_past);

-  llama_batch batch = llama_batch_get_one(tokenized_input.data(), gsl::narrow(tokenized_input.size()));
   llama_token new_token_id = 0;
   bool first_token_generated = false;
-  while (true) {
-    int32_t res = llama_decode(llama_ctx_, batch);
-    if (res == 1) {
-      return std::unexpected{"Could not find a KV slot for the batch (try reducing the size of the batch or increase the context)"};
-    } else if (res < 0) {
-      return std::unexpected{"Error occurred while executing llama decode"};
-    }
-
+  while (decode_status == 0) {
     new_token_id = llama_sampler_sample(llama_sampler_, llama_ctx_, -1);
     if (!first_token_generated) {
       result.time_to_first_token = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start_time);
@@ -147,8 +219,22 @@ std::expected<GenerationResult, std::string> DefaultLlamaContext::generate(cons
     gsl_Assert(len < 128);
     std::string_view token_str{buf.data(), gsl::narrow(len)};

-    batch = llama_batch_get_one(&new_token_id, 1);
+    batch.token[0] = new_token_id;
+    batch.pos[0] = n_past;
+    ++n_past;
     token_handler(token_str);
+
+    decode_status = llama_decode(llama_ctx_, batch);
+  }
+
+  if (decode_status == 1) {
+    return std::unexpected("Could not find a KV slot for the batch (try reducing the size of the batch or increase the context)");
+  }
+  if (decode_status == 2) {
+    return std::unexpected("Llama decode aborted");
+  }
+  if (decode_status < 0) {
+    return std::unexpected("Error occurred while executing llama decode");
   }

   result.tokens_per_second =
diff --git a/extensions/llamacpp/processors/DefaultLlamaContext.h b/extensions/llamacpp/processors/DefaultLlamaContext.h
index 94c3346b2f..279187b404 100644
--- a/extensions/llamacpp/processors/DefaultLlamaContext.h
+++ b/extensions/llamacpp/processors/DefaultLlamaContext.h
@@ -19,12 +19,16 @@
 #include "LlamaContext.h"
 #include "llama.h"
 #include "LlamaBackendInitializer.h"
+#include "chat.h"
+#include "mtmd/mtmd.h"
+#include "minifi-cpp/core/logging/Logger.h"

 namespace org::apache::nifi::minifi::extensions::llamacpp::processors {

 class DefaultLlamaContext : public LlamaContext {
  public:
-  DefaultLlamaContext(const std::filesystem::path& model_path, const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params);
+  DefaultLlamaContext(const std::filesystem::path& model_path, const std::optional<std::filesystem::path>& multimodal_model_path,
+      const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params, const std::shared_ptr<core::logging::Logger>& logger);
   DefaultLlamaContext(const DefaultLlamaContext&) = delete;
   DefaultLlamaContext(DefaultLlamaContext&&) = delete;
   DefaultLlamaContext& operator=(const DefaultLlamaContext&) = delete;
@@ -32,12 +36,15 @@ class DefaultLlamaContext : public LlamaContext {
   ~DefaultLlamaContext() override;

   std::optional<std::string> applyTemplate(const std::vector<LlamaChatMessage>& messages) override;
-  std::expected<GenerationResult, std::string> generate(const std::string& input, std::function<void(std::string_view)> token_handler) override;
+  std::expected<GenerationResult, std::string> generate(const std::string& prompt, const std::vector<std::vector<std::byte>>& files,
+      std::function<void(std::string_view)> token_handler) override;

  private:
   const LlamaBackendInitializer& llama_context_initializer_ = LlamaBackendInitializer::get();
   llama_model* llama_model_{};
+  common_chat_templates_ptr chat_template_;
   llama_context* llama_ctx_{};
+  mtmd_context* multimodal_ctx_{};
   llama_sampler* llama_sampler_{};
 };
diff --git a/extensions/llamacpp/processors/LlamaContext.h b/extensions/llamacpp/processors/LlamaContext.h
index a7cd2eb44e..557b270268 100644
--- a/extensions/llamacpp/processors/LlamaContext.h
+++ b/extensions/llamacpp/processors/LlamaContext.h
@@ -59,7 +59,8 @@ struct GenerationResult {
 class LlamaContext {
  public:
   virtual std::optional<std::string> applyTemplate(const std::vector<LlamaChatMessage>& messages) = 0;
-  virtual std::expected<GenerationResult, std::string> generate(const std::string& input, std::function<void(std::string_view)> token_handler) = 0;
+  virtual std::expected<GenerationResult, std::string> generate(const std::string& input, const std::vector<std::vector<std::byte>>& files,
+      std::function<void(std::string_view)> token_handler) = 0;
   virtual ~LlamaContext() = default;
 };
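The LlamaContext interface above now takes the prompt and the attached file contents separately and streams tokens through a callback. A rough caller's-eye sketch of the changed generate() signature follows; the runPrompt helper is hypothetical, and the std::vector<std::byte> element type for the attachments is assumed from the processor code later in this diff.

#include <cstddef>
#include <iostream>
#include <string>
#include <string_view>
#include <vector>

#include "extensions/llamacpp/processors/LlamaContext.h"

namespace llamacpp = org::apache::nifi::minifi::extensions::llamacpp::processors;

// Hypothetical helper: streams the generated answer to stdout and returns the full text,
// or an error description if generation fails.
std::string runPrompt(llamacpp::LlamaContext& ctx, const std::string& prompt,
                      const std::vector<std::vector<std::byte>>& attached_files) {
  std::string text;
  auto result = ctx.generate(prompt, attached_files, [&](std::string_view token) {
    text += token;                     // accumulate the full answer
    std::cout << token << std::flush;  // and stream it as it is produced
  });
  if (!result) {
    return "generation failed: " + result.error();
  }
  std::cout << "\n[" << result->num_tokens_in << " tokens in, "
            << result->tokens_per_second << " tokens/s]\n";
  return text;
}

For multimodal models the prompt is expected to contain the mtmd media marker where the attachment belongs; RunLlamaCppInference below inserts mtmd_default_marker() in place of the flow file content when a multimodal model path is configured.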
diff --git a/extensions/llamacpp/processors/RunLlamaCppInference.cpp b/extensions/llamacpp/processors/RunLlamaCppInference.cpp
index 5927a0199e..ede77cd062 100644
--- a/extensions/llamacpp/processors/RunLlamaCppInference.cpp
+++ b/extensions/llamacpp/processors/RunLlamaCppInference.cpp
@@ -31,7 +31,9 @@ namespace org::apache::nifi::minifi::extensions::llamacpp::processors {
 MinifiStatus RunLlamaCppInference::onScheduleImpl(api::core::ProcessContext& context) {
   model_path_.clear();
   model_path_ = api::utils::parseProperty(context, ModelPath);
+  multimodal_model_path_ = api::utils::parseOptionalProperty(context, MultiModalModelPath);
   system_prompt_ = context.getProperty(SystemPrompt).value_or("");
+  output_attribute_ = api::utils::parseOptionalProperty(context, OutputAttributeName);

   LlamaSamplerParams llama_sampler_params;
   llama_sampler_params.temperature = api::utils::parseOptionalFloatProperty(context, Temperature);
@@ -51,9 +53,9 @@ MinifiStatus RunLlamaCppInference::onScheduleImpl(api::core::ProcessContext& con
   llama_ctx_params.n_threads_batch = gsl::narrow(api::utils::parseI64Property(context, ThreadsForBatchProcessing));

   if (llama_context_provider_) {
-    llama_ctx_ = llama_context_provider_(model_path_, llama_sampler_params, llama_ctx_params);
+    llama_ctx_ = llama_context_provider_(model_path_, multimodal_model_path_, llama_sampler_params, llama_ctx_params);
   } else {
-    llama_ctx_ = std::make_unique<DefaultLlamaContext>(model_path_, llama_sampler_params, llama_ctx_params);
+    llama_ctx_ = std::make_unique<DefaultLlamaContext>(model_path_, multimodal_model_path_, llama_sampler_params, llama_ctx_params, logger_);
   }

   return MINIFI_STATUS_SUCCESS;
@@ -76,10 +78,16 @@ MinifiStatus RunLlamaCppInference::onTriggerImpl(api::core::ProcessContext& cont
   auto prompt = context.getProperty(Prompt, &flow_file).value_or("");

   auto read_result = session.readBuffer(flow_file);
+  std::vector<std::vector<std::byte>> files;
   std::string input_data_and_prompt;
   if (!read_result.empty()) {
     input_data_and_prompt.append("Input data (or flow file content):\n");
-    input_data_and_prompt.append({reinterpret_cast<const char*>(read_result.data()), read_result.size()});
+    if (multimodal_model_path_) {
+      input_data_and_prompt.append(mtmd_default_marker());
+      files.push_back(std::move(read_result));
+    } else {
+      input_data_and_prompt.append({reinterpret_cast<const char*>(read_result.data()), read_result.size()});
+    }
     input_data_and_prompt.append("\n\n");
   }
   input_data_and_prompt.append(prompt);
@@ -111,7 +119,7 @@ MinifiStatus RunLlamaCppInference::onTriggerImpl(api::core::ProcessContext& cont
   auto start_time = std::chrono::steady_clock::now();

   std::string text;
-  auto generation_result = llama_ctx_->generate(*input, [&] (std::string_view token) {
+  auto generation_result = llama_ctx_->generate(*input, files, [&] (std::string_view token) {
     text += token;
   });

@@ -133,7 +141,12 @@ MinifiStatus RunLlamaCppInference::onTriggerImpl(api::core::ProcessContext& cont
   session.setAttribute(flow_file, LlamaCppTimeToFirstToken.name, std::to_string(generation_result->time_to_first_token.count()) + " ms");
   session.setAttribute(flow_file, LlamaCppTokensPerSecond.name, fmt::format("{:.2f}", generation_result->tokens_per_second));

-  session.writeBuffer(flow_file, text);
+  if (output_attribute_) {
+    session.setAttribute(flow_file, output_attribute_.value(), text);
+  } else {
+    session.writeBuffer(flow_file, text);
+  }
+
   session.transfer(std::move(flow_file), Success);

   return MINIFI_STATUS_SUCCESS;
diff --git a/extensions/llamacpp/processors/RunLlamaCppInference.h b/extensions/llamacpp/processors/RunLlamaCppInference.h
index 76ae3a1f65..21017db0e9 100644
--- a/extensions/llamacpp/processors/RunLlamaCppInference.h
+++ b/extensions/llamacpp/processors/RunLlamaCppInference.h
@@ -29,7 +29,7 @@ namespace org::apache::nifi::minifi::extensions::llamacpp::processors {

 using LlamaContextProvider =
-    std::function<std::unique_ptr<LlamaContext>(const std::filesystem::path& model_path, const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params)>;
+    std::function<std::unique_ptr<LlamaContext>(const std::filesystem::path& model_path, const std::optional<std::filesystem::path>& multimodal_model_path, const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params)>;

 class RunLlamaCppInferenceMetrics {
  public:
@@ -58,6 +58,15 @@ class RunLlamaCppInference : public api::core::ProcessorImpl {
   EXTENSIONAPI static constexpr auto ModelPath = core::PropertyDefinitionBuilder<>::createProperty("Model Path")
       .withDescription("The filesystem path of the model file in gguf format.")
       .isRequired(true)
+      .withValidator(core::StandardPropertyValidators::NON_BLANK_VALIDATOR)
+      .build();
+  EXTENSIONAPI static constexpr auto OutputAttributeName = core::PropertyDefinitionBuilder<>::createProperty("Output Attribute Name")
+      .withDescription("The attribute to write the generated output to; if not provided, the flow file content is overwritten instead.")
+      .withValidator(core::StandardPropertyValidators::NON_BLANK_VALIDATOR)
+      .build();
+  EXTENSIONAPI static constexpr auto MultiModalModelPath = core::PropertyDefinitionBuilder<>::createProperty("MultiModal Model Path")
+      .withDescription("The filesystem path of the multimodal model (vision, audio) file in gguf format.")
+      .withValidator(core::StandardPropertyValidators::NON_BLANK_VALIDATOR)
       .build();
   EXTENSIONAPI static constexpr auto Temperature = core::PropertyDefinitionBuilder<>::createProperty("Temperature")
       .withDescription("The temperature to use for sampling.")
@@ -128,6 +137,8 @@ class RunLlamaCppInference : public api::core::ProcessorImpl {

   EXTENSIONAPI static constexpr auto Properties = std::to_array({
       ModelPath,
+      OutputAttributeName,
+      MultiModalModelPath,
       Temperature,
       TopK,
       TopP,
@@ -167,7 +178,9 @@ class RunLlamaCppInference : public api::core::ProcessorImpl {
   void increaseTokensOut(uint64_t token_count);

   std::string model_path_;
+  std::optional<std::filesystem::path> multimodal_model_path_;
   std::string system_prompt_;
+  std::optional<std::string> output_attribute_;
   LlamaContextProvider llama_context_provider_;

   std::unique_ptr<LlamaContext> llama_ctx_;
diff --git a/extensions/llamacpp/tests/RunLlamaCppInferenceTests.cpp b/extensions/llamacpp/tests/RunLlamaCppInferenceTests.cpp
index 9fa893fba3..2b97976fa3 100644
--- a/extensions/llamacpp/tests/RunLlamaCppInferenceTests.cpp
+++ b/extensions/llamacpp/tests/RunLlamaCppInferenceTests.cpp
@@ -37,10 +37,16 @@ class MockLlamaContext : public processors::LlamaContext {
     return "Test input";
   }

-  std::expected<processors::GenerationResult, std::string> generate(const std::string& input, std::function<void(std::string_view)> token_handler) override {
+  std::expected<processors::GenerationResult, std::string> generate(const std::string& input, const std::vector<std::vector<std::byte>>& files,
+      std::function<void(std::string_view)> token_handler) override {
     if (fail_generation_) {
       return std::unexpected{"Generation failed"};
     }
+    if (multimodal_) {
+      if (files.empty()) {
+        return std::unexpected{"Files empty"};
+      }
+    }
     processors::GenerationResult result;
     input_ = input;
     token_handler("Test ");
@@ -69,7 +75,12 @@ class MockLlamaContext : public processors::LlamaContext {
     fail_apply_template_ = true;
   }

+  void setMultimodal() {
+    multimodal_ = true;
+  }
+
  private:
+  bool multimodal_{false};
   bool fail_generation_{false};
   bool fail_apply_template_{false};
   std::vector<processors::LlamaChatMessage> messages_;
@@ -84,7 +95,7 @@ TEST_CASE("Prompt is generated correctly with default parameters") {
   processors::LlamaContextParams test_context_params;
   minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor(
     core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()},
-    [&](const std::filesystem::path& model_path, const processors::LlamaSamplerParams& sampler_params, const processors::LlamaContextParams& context_params) {
+    [&](const std::filesystem::path& model_path, const std::optional<std::filesystem::path>&, const processors::LlamaSamplerParams& sampler_params, const processors::LlamaContextParams& context_params) {
       test_model_path = model_path;
       test_sampler_params = sampler_params;
       test_context_params = context_params;
@@ -130,7 +141,7 @@ TEST_CASE("Prompt is generated correctly with custom parameters") {
   processors::LlamaContextParams test_context_params;
   minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor(
     core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()},
-    [&](const std::filesystem::path& model_path, const processors::LlamaSamplerParams& sampler_params, const processors::LlamaContextParams& context_params) {
+    [&](const std::filesystem::path& model_path, const std::optional<std::filesystem::path>&, const processors::LlamaSamplerParams& sampler_params, const processors::LlamaContextParams& context_params) {
       test_model_path = model_path;
       test_sampler_params = sampler_params;
       test_context_params = context_params;
@@ -182,7 +193,7 @@ TEST_CASE("Empty flow file does not include input data in prompt") {
   auto mock_llama_context_ptr = mock_llama_context.get();
   minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor(
     core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()},
-    [&](const std::filesystem::path&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
+    [&](const std::filesystem::path&, const std::optional<std::filesystem::path>&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
       return std::move(mock_llama_context);
     }));
   LogTestController::getInstance().setTrace();
@@ -206,7 +217,7 @@ TEST_CASE("Empty flow file does not include input data in prompt") {
 TEST_CASE("Invalid values for optional double type properties throw exception") {
   minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor(
     core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()},
-    [&](const std::filesystem::path&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
+    [&](const std::filesystem::path&, const std::optional<std::filesystem::path>&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
       return std::make_unique<MockLlamaContext>();
     }));
   LogTestController::getInstance().setTrace();
@@ -236,7 +247,7 @@ TEST_CASE("Top K property empty and invalid values are handled properly") {
   std::optional test_top_k = 0;
   minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor(
     core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()},
-    [&](const std::filesystem::path&, const processors::LlamaSamplerParams& sampler_params, const processors::LlamaContextParams&) {
+    [&](const std::filesystem::path&, const std::optional<std::filesystem::path>&, const processors::LlamaSamplerParams& sampler_params, const processors::LlamaContextParams&) {
       test_top_k = sampler_params.top_k;
       return std::make_unique<MockLlamaContext>();
     }));
@@ -269,7 +280,7 @@ TEST_CASE("Error handling during generation and applying template") {

   minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor(
     core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()},
-    [&](const std::filesystem::path&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
+    [&](const std::filesystem::path&, const std::optional<std::filesystem::path>&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
       return std::move(mock_llama_context);
     }));
   LogTestController::getInstance().setTrace();
@@ -287,7 +298,7 @@ TEST_CASE("Error handling during generation and applying template") {
 TEST_CASE("Route flow file to failure when prompt and input data is empty") {
   minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor(
     core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()},
-    [&](const std::filesystem::path&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
+    [&](const std::filesystem::path&, const std::optional<std::filesystem::path>&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
       return std::make_unique<MockLlamaContext>();
     }));
   LogTestController::getInstance().setTrace();
@@ -307,7 +318,7 @@ TEST_CASE("System prompt is optional") {
   auto mock_llama_context_ptr = mock_llama_context.get();
   minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor(
     core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()},
-    [&](const std::filesystem::path&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
+    [&](const std::filesystem::path&, const std::optional<std::filesystem::path>&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
       return std::move(mock_llama_context);
     }));
   LogTestController::getInstance().setTrace();
@@ -329,7 +340,7 @@ TEST_CASE("Test output metrics") {
   auto processor = minifi::test::utils::make_custom_c_processor(
     core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()},
-    [&](const std::filesystem::path&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
+    [&](const std::filesystem::path&, const std::optional<std::filesystem::path>&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) {
       return std::make_unique<MockLlamaContext>();
     });
   auto processor_metrics = processor->getMetrics();
@@ -357,4 +368,63 @@ TEST_CASE("Test output metrics") {
   CHECK(c2_metrics[0].children[c2_metrics[0].children.size() - 1].value.to_string() == "6");
 }

inference") { + auto mock_llama_context = std::make_unique(); + auto mock_llama_context_ptr = mock_llama_context.get(); + std::filesystem::path test_model_path; + std::optional test_model_path_option; + minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor( + core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()}, + [&](const std::filesystem::path& model_path, const std::optional& multimodal_model_path, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) { + test_model_path = model_path; + test_model_path_option = multimodal_model_path; + if (multimodal_model_path) { + mock_llama_context->setMultimodal(); + } + return std::move(mock_llama_context); + })); + LogTestController::getInstance().setTrace(); + REQUIRE(controller.getProcessor()->setProperty(processors::RunLlamaCppInference::ModelPath.name, "/path/to/model")); + REQUIRE(controller.getProcessor()->setProperty(processors::RunLlamaCppInference::MultiModalModelPath.name, "/path/to/mm-model")); + REQUIRE(controller.getProcessor()->setProperty(processors::RunLlamaCppInference::Prompt.name, "What is on the image?")); + + + SECTION("Flowfile contains data") { + auto results = controller.trigger(minifi::test::InputFlowFileData{.content = "", .attributes = {}}); + CHECK(test_model_path == "/path/to/model"); + CHECK(test_model_path_option == "/path/to/mm-model"); + REQUIRE(results.at(processors::RunLlamaCppInference::Success).size() == 1); + auto& output_flow_file = results.at(processors::RunLlamaCppInference::Success)[0]; + CHECK(controller.plan->getContent(output_flow_file) == "Test generated content"); + CHECK(mock_llama_context_ptr->getInput() == "Test input"); + REQUIRE(mock_llama_context_ptr->getMessages().size() == 2); + CHECK(mock_llama_context_ptr->getMessages()[1].role == "user"); + CHECK(mock_llama_context_ptr->getMessages()[1].content == "Input data (or flow file content):\n<__media__>\n\nWhat is on the image?"); + } + + SECTION("Flowfile is empty") { + auto results = controller.trigger(minifi::test::InputFlowFileData{.content = "", .attributes = {}}); + REQUIRE(results.at(processors::RunLlamaCppInference::Failure).size() == 1); + } +} + +TEST_CASE("Can write content to attribute") { + minifi::test::SingleProcessorTestController controller(minifi::test::utils::make_custom_c_processor( + core::ProcessorMetadata{utils::Identifier{}, "RunLlamaCppInference", logging::LoggerFactory::getLogger()}, + [&](const std::filesystem::path&, const std::optional&, const processors::LlamaSamplerParams&, const processors::LlamaContextParams&) { + return std::make_unique(); + })); + LogTestController::getInstance().setTrace(); + REQUIRE(controller.getProcessor()->setProperty(processors::RunLlamaCppInference::ModelPath.name, "/path/to/model")); + REQUIRE(controller.getProcessor()->setProperty(processors::RunLlamaCppInference::Prompt.name, "What is love?")); + REQUIRE(controller.getProcessor()->setProperty(processors::RunLlamaCppInference::OutputAttributeName.name, "DontHurtMe")); + + + auto results = controller.trigger(minifi::test::InputFlowFileData{.content = "Some content", .attributes = {}}); + REQUIRE(results.at(processors::RunLlamaCppInference::Success).size() == 1); + auto& output_flow_file = results.at(processors::RunLlamaCppInference::Success)[0]; + CHECK(controller.plan->getContent(output_flow_file) == "Some content"); + CHECK(output_flow_file->getAttribute("DontHurtMe") == "Test generated content"); +} + } // namespace 
 }  // namespace org::apache::nifi::minifi::extensions::llamacpp::test
diff --git a/extensions/llamacpp/tests/features/environment.py b/extensions/llamacpp/tests/features/environment.py
index 9f23b76564..bcef201bf4 100644
--- a/extensions/llamacpp/tests/features/environment.py
+++ b/extensions/llamacpp/tests/features/environment.py
@@ -14,6 +14,7 @@
 # limitations under the License.

 from textwrap import dedent
+from pathlib import Path
 from minifi_behave.containers.docker_image_builder import DockerImageBuilder
 from minifi_behave.core.hooks import common_before_scenario
 from minifi_behave.core.hooks import common_after_scenario
@@ -29,7 +30,9 @@ def before_all(context: MinifiTestContext):

     dockerfile = dedent("""\
         FROM {base_image}
-        RUN mkdir {models_path} && wget https://huggingface.co/bartowski/Qwen2-0.5B-Instruct-GGUF/resolve/main/Qwen2-0.5B-Instruct-IQ3_M.gguf --directory-prefix={models_path}
+        RUN mkdir {models_path}
+        RUN wget https://huggingface.co/bartowski/Qwen2-VL-2B-Instruct-GGUF/resolve/main/Qwen2-VL-2B-Instruct-Q3_K_M.gguf --directory-prefix={models_path}
+        RUN wget https://huggingface.co/bartowski/Qwen2-VL-2B-Instruct-GGUF/resolve/main/mmproj-Qwen2-VL-2B-Instruct-f16.gguf --directory-prefix={models_path}
         """.format(base_image=minifi_container_image, models_path='/tmp/models'))

     builder = DockerImageBuilder(
@@ -42,6 +45,7 @@ def before_all(context: MinifiTestContext):
 def before_scenario(context: MinifiTestContext, scenario):
     context.minifi_container_image = "apacheminificpp:llama"
     common_before_scenario(context, scenario)
+    context.resource_dir = Path(__file__).resolve().parent / "resources"


 def after_scenario(context, scenario):
diff --git a/extensions/llamacpp/tests/features/llamacpp.feature b/extensions/llamacpp/tests/features/llamacpp.feature
index 437461c08f..b905ab4e69 100644
--- a/extensions/llamacpp/tests/features/llamacpp.feature
+++ b/extensions/llamacpp/tests/features/llamacpp.feature
@@ -18,11 +18,27 @@ Feature: Run language model inference using LlamaCpp processor

   Scenario: Test inference with a small model
     Given a GenerateFlowFile processor with the "File Size" property set to "0B"
-    And a RunLlamaCppInference processor with the "Model Path" property set to "/tmp/models/Qwen2-0.5B-Instruct-IQ3_M.gguf"
+    And a RunLlamaCppInference processor with the "Model Path" property set to "/tmp/models/Qwen2-VL-2B-Instruct-Q3_K_M.gguf"
     And the "Prompt" property of the RunLlamaCppInference processor is set to "Repeat after me: banana banana banana"
+    And the "Temperature" property of the RunLlamaCppInference processor is set to "0"
     And a LogAttribute processor with the "Log Payload" property set to "true"
     And the "success" relationship of the GenerateFlowFile processor is connected to the RunLlamaCppInference
     And the "success" relationship of the RunLlamaCppInference processor is connected to the LogAttribute

     When all instances start up
     Then the Minifi logs contain the following message: "banana" in less than 60 seconds
+
+  Scenario: Test multimodal inference with a small model
+    Given a GetFile processor with the "Input Directory" property set to "/tmp/input"
+    And a directory at "/tmp/input" has a file with the content from "test-image.png"
+    And a RunLlamaCppInference processor with the "Model Path" property set to "/tmp/models/Qwen2-VL-2B-Instruct-Q3_K_M.gguf"
+    And the "Prompt" property of the RunLlamaCppInference processor is set to "Output only what is written on the image."
+ And the "MultiModal Model Path" property of the RunLlamaCppInference processor is set to "/tmp/models/mmproj-Qwen2-VL-2B-Instruct-f16.gguf" + And the "Temperature" property of the RunLlamaCppInference processor is set to "0" + And a PutFile processor with the "Directory" property set to "/tmp/output" + And the "success" relationship of the GetFile processor is connected to the RunLlamaCppInference + And the "success" relationship of the RunLlamaCppInference processor is connected to the PutFile + + When all instances start up + Then a single file with the content "minifi" is placed in the "/tmp/output" directory in less than 60 seconds + diff --git a/extensions/llamacpp/tests/features/resources/test-image.png b/extensions/llamacpp/tests/features/resources/test-image.png new file mode 100644 index 0000000000..f6d7720d5a Binary files /dev/null and b/extensions/llamacpp/tests/features/resources/test-image.png differ diff --git a/thirdparty/llamacpp/cpp-23-fixes.patch b/thirdparty/llamacpp/cpp-23-fixes.patch deleted file mode 100644 index 0e84e43956..0000000000 --- a/thirdparty/llamacpp/cpp-23-fixes.patch +++ /dev/null @@ -1,24 +0,0 @@ -From 072bd8ce7e10a0fffb1e2bc755c2964e472909ed Mon Sep 17 00:00:00 2001 -From: Martin Zink -Date: Tue, 22 Jul 2025 12:49:42 +0200 -Subject: [PATCH] c++23 fixes - ---- - src/llama-hparams.cpp | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp -index c6c67d26..db36de4d 100644 ---- a/src/llama-hparams.cpp -+++ b/src/llama-hparams.cpp -@@ -1,5 +1,7 @@ - #include "llama-hparams.h" - -+#include -+ - #include "ggml.h" - - void llama_hparams::set_swa_pattern(uint32_t n_pattern) { --- -2.39.5 (Apple Git-154) - diff --git a/thirdparty/llamacpp/lu8_macro_fix.patch b/thirdparty/llamacpp/lu8_macro_fix.patch deleted file mode 100644 index a1b92d28b3..0000000000 --- a/thirdparty/llamacpp/lu8_macro_fix.patch +++ /dev/null @@ -1,17 +0,0 @@ -diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp -index dd27a381..47550954 100644 ---- a/src/llama-chat.cpp -+++ b/src/llama-chat.cpp -@@ -6,11 +6,7 @@ - #include - #include - --#if __cplusplus >= 202000L -- #define LU8(x) (const char*)(u8##x) --#else -- #define LU8(x) u8##x --#endif -+#define LU8(x) reinterpret_cast(u8##x) - - // trim whitespace from the beginning and end of a string - static std::string trim(const std::string & str) { diff --git a/thirdparty/llamacpp/mtmd-fix.patch b/thirdparty/llamacpp/mtmd-fix.patch new file mode 100644 index 0000000000..6ac631d4e5 --- /dev/null +++ b/thirdparty/llamacpp/mtmd-fix.patch @@ -0,0 +1,44 @@ +diff --color=auto -rupN llama.cpp-b8944/CMakeLists.txt llama.cpp-b8944-patched/CMakeLists.txt +--- llama.cpp-b8944/CMakeLists.txt 2026-04-27 08:30:55 ++++ llama.cpp-b8944-patched/CMakeLists.txt 2026-04-27 13:49:25 +@@ -191,6 +191,7 @@ add_subdirectory(src) + # + + add_subdirectory(src) ++add_subdirectory(tools/mtmd) + + # + # utils, programs, examples and tests +diff --color=auto -rupN llama.cpp-b8944/common/ngram-mod.cpp llama.cpp-b8944-patched/common/ngram-mod.cpp +--- llama.cpp-b8944/common/ngram-mod.cpp 2026-04-27 08:30:55 ++++ llama.cpp-b8944-patched/common/ngram-mod.cpp 2026-04-30 08:28:08 +@@ -1,4 +1,5 @@ + #include "ngram-mod.h" ++#include + + // + // common_ngram_mod +diff --color=auto -rupN llama.cpp-b8944/tools/mtmd/CMakeLists.txt llama.cpp-b8944-patched/tools/mtmd/CMakeLists.txt +--- llama.cpp-b8944/tools/mtmd/CMakeLists.txt 2026-04-27 08:30:55 ++++ llama.cpp-b8944-patched/tools/mtmd/CMakeLists.txt 2026-04-27 13:50:45 +@@ -101,20 +101,6 
+@@ -101,20 +101,6 @@ endif()
+     endif()
+ endif()
+
+-add_executable(llama-llava-cli deprecation-warning.cpp)
+-add_executable(llama-gemma3-cli deprecation-warning.cpp)
+-add_executable(llama-minicpmv-cli deprecation-warning.cpp)
+-add_executable(llama-qwen2vl-cli deprecation-warning.cpp)
+-
+-set(TARGET llama-mtmd-cli)
+-add_executable (${TARGET} mtmd-cli.cpp)
+-set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
+-if(LLAMA_TOOLS_INSTALL)
+-    install(TARGETS ${TARGET} RUNTIME)
+-endif()
+-target_link_libraries (${TARGET} PRIVATE llama-common mtmd Threads::Threads)
+-target_compile_features(${TARGET} PRIVATE cxx_std_17)
+-
+ # mtmd-debug tool
+ add_executable(llama-mtmd-debug debug/mtmd-debug.cpp)
+ set_target_properties(llama-mtmd-debug PROPERTIES OUTPUT_NAME llama-mtmd-debug)