2 changes: 1 addition & 1 deletion apps/llm/app/voice_chat/index.tsx
@@ -101,7 +101,7 @@ function VoiceChatScreen() {
return !llm.isReady || !speechToText.isReady ? (
<Spinner
visible={!llm.isReady || !speechToText.isReady}
textContent={`Loading the model ${(llm.downloadProgress * 100).toFixed(0)} %\nLoading the speech model ${(speechToText.downloadProgress * 100).toFixed(0)} %`}
textContent={`Loading the LLM model ${(llm.downloadProgress * 100).toFixed(0)} %\nLoading the speech model ${(speechToText.downloadProgress * 100).toFixed(0)} %`}
/>
) : (
<TouchableWithoutFeedback onPress={Keyboard.dismiss}>
Binary file modified packages/react-native-executorch/android/libs/classes.jar
Binary file not shown.
@@ -9,7 +9,7 @@

namespace rnexecutorch {
using namespace facebook;
using namespace executorch::extension::constants;
using namespace executorch::extension::llm;

TokenizerModule::TokenizerModule(
std::string source, std::shared_ptr<react::CallInvoker> callInvoker)
@@ -46,7 +46,7 @@ class BaseModel {
// (unnecessary copies instead of working on JS memory). In this case
// CallInvoker can be used to get jsi::Runtime, and use it in a safe manner.
std::shared_ptr<react::CallInvoker> callInvoker;
std::unique_ptr<executorch::extension::Module> module_;
std::unique_ptr<Module> module_;

std::size_t memorySizeLowerBound{0};

@@ -74,7 +74,7 @@ TEST(S2TTranscribeTests, TranscribeReturnsValidChars) {
auto result = model.transcribe(audio, "en", true);
ASSERT_EQ(result.language, "en");
EXPECT_GE(result.duration, 20.0f);
ASSERT_EQ(result.task, "transcription");
ASSERT_EQ(result.task, "transcribe");
ASSERT_FALSE(result.segments.empty());
ASSERT_FALSE(result.text.empty());
for (char c : result.text) {
5 changes: 3 additions & 2 deletions packages/react-native-executorch/common/runner/constants.h
@@ -7,7 +7,7 @@
*/
#pragma once
// constants for LLM runtime
namespace executorch::extension::constants {
namespace executorch::extension::llm {

// Runtime metadata key constants
inline constexpr auto kEnableDynamicShape = "enable_dynamic_shape";
@@ -27,4 +27,5 @@ inline constexpr auto kTextModelMethod = "text_decoder";

inline constexpr auto numOfAddedBoSTokens = 0;
inline constexpr auto numOfAddedEoSTokens = 0;
} // namespace executorch::extension::constants

} // namespace executorch::extension::llm
4 changes: 4 additions & 0 deletions packages/react-native-executorch/common/runner/irunner.h
@@ -65,6 +65,10 @@ struct GenerationConfig {

// Use KV_CACHE implementation (if implemented) or not
bool enable_kv_cache = true;

// Number of BOS and EOS tokens to add to the prompt
int32_t num_bos = 0;
int32_t num_eos = 0;
};

// Base interface for LLM runners
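For context, a minimal caller-side sketch of the two new fields, assuming GenerationConfig sits in the executorch::extension::llm namespace like the rest of this runner and that the numOfAddedBoSTokens / numOfAddedEoSTokens constants from constants.h are the intended defaults; the helper function and include paths are illustrative only.

#include "constants.h"
#include "irunner.h"

// Hypothetical helper: seed the new num_bos / num_eos fields from the
// constants defined in constants.h; callers can still override them.
executorch::extension::llm::GenerationConfig makeDefaultConfig() {
  executorch::extension::llm::GenerationConfig config;
  config.enable_kv_cache = true; // existing field shown in this hunk
  config.num_bos = executorch::extension::llm::numOfAddedBoSTokens;
  config.num_eos = executorch::extension::llm::numOfAddedEoSTokens;
  return config;
}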
@@ -15,8 +15,8 @@
#pragma once

// This list should be very conservative since most kernel .cpp files will
// include these and depend on their transitive deps. Only add a header if 99%
// of kernels would have included it anyway.
// include these and depend on their transitive deps. Only add a header if
// 99% of kernels would have included it anyway.
#include <executorch/runtime/core/exec_aten/exec_aten.h> // IWYU pragma: export
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h> // IWYU pragma: export
#include <executorch/runtime/core/exec_aten/util/tensor_util.h> // IWYU pragma: export
2 changes: 1 addition & 1 deletion packages/react-native-executorch/common/runner/runner.cpp
@@ -19,7 +19,7 @@

namespace example {

using namespace executorch::extension::constants;
using namespace executorch::extension::llm;
using ::executorch::extension::Module;
using ::executorch::runtime::Error;
using ::executorch::runtime::Result;
8 changes: 8 additions & 0 deletions packages/react-native-executorch/common/runner/sampler.h
@@ -26,6 +26,8 @@ namespace extension {
namespace llm {
// A simple llama2 sampler.

inline constexpr auto kTopp = 0.9f;

template <typename T> struct ProbIndex {
T prob;
int32_t index;
@@ -65,3 +67,9 @@ using ::executorch::extension::llm::ProbIndex;
using ::executorch::extension::llm::Sampler;
} // namespace executor
} // namespace torch

namespace executorch::llm {
// TODO(T197294990): Remove these deprecated aliases once all users have moved
// to the new `::executorch::extension::llm` namespaces.
using ::executorch::extension::llm::kTopp;
} // namespace executorch::llm
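A usage sketch for the new kTopp constant, assuming the Sampler keeps its llama2-style constructor (vocab_size, temperature, topp, rng_seed); the vocabulary size, temperature, and seed are placeholders.

#include "sampler.h"

// Sketch only: construct the sampler with the shared top-p constant instead
// of a hard-coded 0.9f at each call site (constructor signature assumed).
void sampler_example() {
  executorch::extension::llm::Sampler sampler(
      /*vocab_size=*/32000,
      /*temperature=*/0.8f,
      /*topp=*/executorch::extension::llm::kTopp,
      /*rng_seed=*/42);
  (void)sampler;
}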
61 changes: 59 additions & 2 deletions packages/react-native-executorch/common/runner/stats.h
@@ -11,6 +11,7 @@
#include "util.h"
#include <cinttypes>
#include <executorch/runtime/platform/log.h>
#include <limits>
#include <sstream>
#include <string>

@@ -44,11 +45,19 @@ struct Stats {
// inference_end_ms: End of inference/generation.
long inference_end_ms;
// Keep a running total of the time spent in sampling.
long aggregate_sampling_time_ms;
long aggregate_sampling_time_ms = 0;
// Token count from prompt
int64_t num_prompt_tokens;
// Token count from generated (total - prompt)
int64_t num_generated_tokens;
// GPU memory stats (optional; may be zero if not available)
// GPU memory stats (optional). Use sentinel UINT64_MAX / -1.0 to indicate
// "not available".
uint64_t gpu_total_bytes = std::numeric_limits<uint64_t>::max();
uint64_t gpu_free_before_load_bytes = std::numeric_limits<uint64_t>::max();
uint64_t gpu_free_after_load_bytes = std::numeric_limits<uint64_t>::max();
uint64_t gpu_free_after_generate_bytes = std::numeric_limits<uint64_t>::max();
double gpu_peak_usage_mb = -1.0;
inline void on_sampling_begin() {
aggregate_sampling_timer_start_timestamp = time_in_ms();
}
@@ -75,6 +84,11 @@ struct Stats {
aggregate_sampling_time_ms = 0;
num_prompt_tokens = 0;
num_generated_tokens = 0;
gpu_total_bytes = std::numeric_limits<uint64_t>::max();
gpu_free_before_load_bytes = std::numeric_limits<uint64_t>::max();
gpu_free_after_load_bytes = std::numeric_limits<uint64_t>::max();
gpu_free_after_generate_bytes = std::numeric_limits<uint64_t>::max();
gpu_peak_usage_mb = -1.0;
aggregate_sampling_timer_start_timestamp = 0;
}

@@ -93,7 +107,29 @@ inline std::string stats_to_json_string(const Stats &stats) {
<< "\"prompt_eval_end_ms\":" << stats.prompt_eval_end_ms << ","
<< "\"first_token_ms\":" << stats.first_token_ms << ","
<< "\"aggregate_sampling_time_ms\":" << stats.aggregate_sampling_time_ms
<< "," << "\"SCALING_FACTOR_UNITS_PER_SECOND\":"
<< ",";
// Only include GPU fields in the JSON if gpu_total_bytes is valid (not
// equal to sentinel -1)
if (stats.gpu_total_bytes != static_cast<uint64_t>(-1)) {
ss << "\"gpu_total_bytes\":" << stats.gpu_total_bytes;
if (stats.gpu_free_before_load_bytes != static_cast<uint64_t>(-1)) {
ss << ",\"gpu_free_before_load_bytes\":"
<< stats.gpu_free_before_load_bytes;
}
if (stats.gpu_free_after_load_bytes != static_cast<uint64_t>(-1)) {
ss << ",\"gpu_free_after_load_bytes\":"
<< stats.gpu_free_after_load_bytes;
}
if (stats.gpu_free_after_generate_bytes != static_cast<uint64_t>(-1)) {
ss << ",\"gpu_free_after_generate_bytes\":"
<< stats.gpu_free_after_generate_bytes;
}
if (stats.gpu_peak_usage_mb >= 0.0) {
ss << ",\"gpu_peak_usage_mb\":" << stats.gpu_peak_usage_mb;
}
ss << ",";
}
ss << "\"SCALING_FACTOR_UNITS_PER_SECOND\":"
<< stats.SCALING_FACTOR_UNITS_PER_SECOND << "}";
return ss.str();
}
@@ -145,6 +181,27 @@ inline void print_report(const Stats &stats) {
stats.num_prompt_tokens + stats.num_generated_tokens,
(double)stats.aggregate_sampling_time_ms /
stats.SCALING_FACTOR_UNITS_PER_SECOND);

// GPU memory reporting (only meaningful if GPU fields were populated)
if (stats.gpu_total_bytes != static_cast<uint64_t>(-1)) {
ET_LOG(Info, "\tGPU total memory: %.2f MB",
stats.gpu_total_bytes / 1024.0 / 1024.0);
if (stats.gpu_free_before_load_bytes != static_cast<uint64_t>(-1)) {
ET_LOG(Info, "\tGPU free before load: %.2f MB",
stats.gpu_free_before_load_bytes / 1024.0 / 1024.0);
}
if (stats.gpu_free_after_load_bytes != static_cast<uint64_t>(-1)) {
ET_LOG(Info, "\tGPU free after load: %.2f MB",
stats.gpu_free_after_load_bytes / 1024.0 / 1024.0);
}
if (stats.gpu_free_after_generate_bytes != static_cast<uint64_t>(-1)) {
ET_LOG(Info, "\tGPU free after generate: %.2f MB",
stats.gpu_free_after_generate_bytes / 1024.0 / 1024.0);
}
if (stats.gpu_peak_usage_mb >= 0.0) {
ET_LOG(Info, "\tGPU peak usage: %.2f MB", stats.gpu_peak_usage_mb);
}
}
}

} // namespace llm
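To make the sentinel behavior concrete, a small sketch (byte counts invented, namespace assumed to match the rest of this runner): a Stats left at its defaults emits no gpu_* keys, and each key appears only once its field is moved off the sentinel.

#include "stats.h"
#include <string>

// Sketch: GPU keys are emitted only for fields moved off their sentinels.
void stats_json_example() {
  executorch::extension::llm::Stats stats{};
  stats.num_prompt_tokens = 16;
  stats.num_generated_tokens = 32;
  // All GPU fields still at their sentinels -> JSON contains no "gpu_*" keys.

  stats.gpu_total_bytes = 8ull * 1024 * 1024 * 1024;            // e.g. 8 GB
  stats.gpu_free_before_load_bytes = 6ull * 1024 * 1024 * 1024; // e.g. 6 GB
  stats.gpu_peak_usage_mb = 1536.0;
  // Now "gpu_total_bytes", "gpu_free_before_load_bytes" and
  // "gpu_peak_usage_mb" show up; the unset "after load/generate" keys do not.
  std::string json = executorch::extension::llm::stats_to_json_string(stats);
  (void)json;
}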
@@ -32,15 +32,23 @@ TextDecoderRunner::TextDecoderRunner(Module *module, IOManager *io_manager,
::executorch::runtime::Result<executorch::aten::Tensor>
TextDecoderRunner::step(TensorPtr &tokens, int64_t start_pos) {
// ET_LOG(Info, "Input token %" PRIu64, input_token);
auto method_meta = ET_UNWRAP(module_->method_meta("forward"));
auto method_meta_result = module_->method_meta("forward");
if (!method_meta_result.ok()) {
return method_meta_result.error();
}
auto method_meta = std::move(*method_meta_result);
// If only 1 input, we are not using kv cache
bool use_kv_cache = method_meta.num_inputs() > 1;

std::vector<int64_t> cache_positions;

if (use_kv_cache) {
auto start_pos_tensor = ET_UNWRAP(populate_start_pos_or_cache_position(
module_, start_pos, cache_positions, tokens->numel(), "forward"));
auto start_pos_tensor_result = populate_start_pos_or_cache_position(
module_, start_pos, cache_positions, tokens->numel(), "forward");
if (!start_pos_tensor_result.ok()) {
return start_pos_tensor_result.error();
}
auto start_pos_tensor = std::move(*start_pos_tensor_result);

std::vector<runtime::EValue> inputs;
auto inputs_res = io_manager_->prepare_decode(tokens, start_pos_tensor);
@@ -32,7 +32,7 @@
}

// Check if we need to chunk the prompt tokens
int32_t num_prompt_tokens = prompt_tokens.size();

[warning] GitHub Actions / build, line 35 in packages/react-native-executorch/common/runner/text_prefiller.cpp: implicit conversion loses integer precision: 'size_type' (aka 'unsigned long') to 'int32_t' (aka 'int') [-Wshorten-64-to-32]
// If prompt tokens exceed max_seq_len_, we need to chunk them
if (num_prompt_tokens > max_seq_len_) {
@@ -41,7 +41,7 @@

while (num_tokens_to_process < num_prompt_tokens) {
auto num_tokens_to_prefill_with = std::min<int>(
num_prompt_tokens - num_tokens_to_process, max_seq_len_);

[warning] GitHub Actions / build, line 44 in packages/react-native-executorch/common/runner/text_prefiller.cpp: implicit conversion loses integer precision: 'int64_t' (aka 'long long') to 'const int' [-Wshorten-64-to-32]

std::vector<uint64_t> prompt_tokens_to_process(
num_tokens_to_prefill_with);
@@ -98,8 +98,11 @@

// run the first token and get back logits tensor. Assuming the first token
// is bos so don't callback.
auto logits_tensor =
ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos));
auto logits_result = text_decoder_runner_->step(tokens, start_pos);
if (!logits_result.ok()) {
return logits_result.error();
}
auto logits_tensor = std::move(*logits_result);

pos += 1; // start the loop from index 1
start_pos += 1;
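The two build warnings above flag implicit 64-bit-to-32-bit narrowing. For illustration only (not part of this change), the explicit casts the warning asks for would look roughly like this, with stand-in variables mirroring the flagged lines:

#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative sketch of silencing -Wshorten-64-to-32 by making the
// narrowing explicit; variable names mirror the flagged lines above.
void narrowing_example(const std::vector<uint64_t> &prompt_tokens,
                       int64_t max_seq_len, int32_t num_tokens_to_process) {
  const auto num_prompt_tokens =
      static_cast<int32_t>(prompt_tokens.size()); // size_t -> int32_t
  const auto num_tokens_to_prefill_with =
      std::min<int>(num_prompt_tokens - num_tokens_to_process,
                    static_cast<int>(max_seq_len)); // int64_t -> int
  (void)num_tokens_to_prefill_with;
}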
@@ -39,6 +39,10 @@ class TextTokenGenerator {
* @param start_pos The start position of the new tokens, based on how many
* prompt tokens are prefilled.
* @param max_new_tokens Maximum number of new tokens to generate.
* @param temperature controls the randomness of predictions by scaling the
* logits before applying softmax. A higher temperature results in more
* random predictions, while a lower temperature results in more deterministic
* predictions.
* @param token_callback what to do after a token is generated.
* @return how many tokens are generated.
*/
@@ -113,7 +117,8 @@ class TextTokenGenerator {
// We pass false, as we don't want to skip special tokens, e.g.
// <think>

auto decodeResult = tokenizer_->decode(token_cache, false);
auto decodeResult =
tokenizer_->decode(token_cache, false); // NOTE: difference
if (!decodeResult.ok()) {
throw rnexecutorch::RnExecutorchError(
rnexecutorch::RnExecutorchErrorCode::TokenizerError,
39 changes: 37 additions & 2 deletions packages/react-native-executorch/common/runner/util.h
@@ -10,10 +10,12 @@
#include "constants.h"
#include "text_prefiller.h"
#include <cctype>
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>
#include <executorch/runtime/platform/compiler.h>
#include <stdio.h>
#include <time.h>
#include <vector>
#if defined(__linux__) || defined(__ANDROID__) || defined(__unix__)
#include <sys/resource.h>
#endif
@@ -112,8 +114,16 @@ populate_start_pos_or_cache_position(Module *module, int64_t &start_pos,
const char *method_name = "forward") {
// Get expected shape of cache position tensor, which should be the second
// argument
auto method_meta = ET_UNWRAP(module->method_meta(method_name));
auto second_input_info = ET_UNWRAP(method_meta.input_tensor_meta(1));
auto method_meta_result = module->method_meta(method_name);
if (!method_meta_result.ok()) {
return method_meta_result.error();
}
auto method_meta = std::move(*method_meta_result);
auto second_input_info_result = method_meta.input_tensor_meta(1);
if (!second_input_info_result.ok()) {
return second_input_info_result.error();
}
auto second_input_info = std::move(*second_input_info_result);
auto second_input_sizes = second_input_info.sizes();
auto numel = second_input_sizes[0];

@@ -136,6 +146,31 @@
}
}

/**
* Helper function to convert a float tensor to bfloat16.
* Creates a new tensor with bfloat16 dtype and copies/converts the data.
*/
inline ::executorch::runtime::Result<::executorch::extension::TensorPtr>
convert_to_bfloat16(const ::executorch::extension::TensorPtr &src_tensor) {
ET_CHECK_OR_RETURN_ERROR(
src_tensor->scalar_type() == ::executorch::aten::ScalarType::Float,
InvalidArgument,
"BFloat16 conversion only supported from Float source data");

const auto num_elements = static_cast<size_t>(src_tensor->numel());
const float *float_data = src_tensor->const_data_ptr<float>();

auto bf16_tensor = ::executorch::extension::empty_like(
src_tensor, ::executorch::aten::ScalarType::BFloat16);
auto *bf16_data =
bf16_tensor->mutable_data_ptr<::executorch::aten::BFloat16>();
for (size_t i = 0; i < num_elements; ++i) {
bf16_data[i] = ::executorch::aten::BFloat16(float_data[i]);
}

return bf16_tensor;
}

} // namespace llm
} // namespace extension
} // namespace executorch
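A hedged usage sketch for the new convert_to_bfloat16 helper; the make_tensor_ptr overload, the include paths, and the values are assumptions, not part of this PR.

#include "util.h"
#include <executorch/extension/tensor/tensor.h>
#include <utility>
#include <vector>

// Sketch: build a small float tensor, convert it, and handle the error path.
void bfloat16_example() {
  // make_tensor_ptr overload assumed: a 1-D float tensor from a vector.
  auto float_tensor = ::executorch::extension::make_tensor_ptr(
      std::vector<float>{1.0f, 2.0f, 3.0f, 4.0f});
  auto bf16_result =
      ::executorch::extension::llm::convert_to_bfloat16(float_tensor);
  if (!bf16_result.ok()) {
    return; // helper only accepts Float sources, per its ET_CHECK above
  }
  auto bf16_tensor = std::move(*bf16_result);
  // bf16_tensor now reports ScalarType::BFloat16 with converted copies.
  (void)bf16_tensor;
}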