2 changes: 1 addition & 1 deletion apps/llm/app/voice_chat/index.tsx
@@ -101,7 +101,7 @@ function VoiceChatScreen() {
return !llm.isReady || !speechToText.isReady ? (
<Spinner
visible={!llm.isReady || !speechToText.isReady}
textContent={`Loading the model ${(llm.downloadProgress * 100).toFixed(0)} %\nLoading the speech model ${(speechToText.downloadProgress * 100).toFixed(0)} %`}
textContent={`Loading the LLM model ${(llm.downloadProgress * 100).toFixed(0)} %\nLoading the speech model ${(speechToText.downloadProgress * 100).toFixed(0)} %`}
/>
) : (
<TouchableWithoutFeedback onPress={Keyboard.dismiss}>
Binary file modified packages/react-native-executorch/android/libs/classes.jar
Binary file not shown.
@@ -9,7 +9,7 @@

namespace rnexecutorch {
using namespace facebook;
using namespace executorch::extension::constants;
using namespace executorch::extension::llm;

TokenizerModule::TokenizerModule(
std::string source, std::shared_ptr<react::CallInvoker> callInvoker)
@@ -46,7 +46,7 @@ class BaseModel {
// (unnecessary copies instead of working on JS memory). In this case
// CallInvoker can be used to get jsi::Runtime, and use it in a safe manner.
std::shared_ptr<react::CallInvoker> callInvoker;
std::unique_ptr<executorch::extension::Module> module_;
std::unique_ptr<Module> module_;

std::size_t memorySizeLowerBound{0};

@@ -74,7 +74,7 @@ TEST(S2TTranscribeTests, TranscribeReturnsValidChars) {
auto result = model.transcribe(audio, "en", true);
ASSERT_EQ(result.language, "en");
EXPECT_GE(result.duration, 20.0f);
ASSERT_EQ(result.task, "transcription");
ASSERT_EQ(result.task, "transcribe");
ASSERT_FALSE(result.segments.empty());
ASSERT_FALSE(result.text.empty());
for (char c : result.text) {
5 changes: 3 additions & 2 deletions packages/react-native-executorch/common/runner/constants.h
@@ -7,7 +7,7 @@
*/
#pragma once
// constants for LLM runtime
namespace executorch::extension::constants {
namespace executorch::extension::llm {

// Runtime metadata key constants
inline constexpr auto kEnableDynamicShape = "enable_dynamic_shape";
@@ -27,4 +27,5 @@ inline constexpr auto kTextModelMethod = "text_decoder";

inline constexpr auto numOfAddedBoSTokens = 0;
inline constexpr auto numOfAddedEoSTokens = 0;
} // namespace executorch::extension::constants

} // namespace executorch::extension::llm
4 changes: 4 additions & 0 deletions packages/react-native-executorch/common/runner/irunner.h
@@ -65,6 +65,10 @@ struct GenerationConfig {

// Use KV_CACHE implementation (if implemented) or not
bool enable_kv_cache = true;

// Number of BOS and EOS tokens to add to the prompt
int32_t num_bos = 0;
int32_t num_eos = 0;
};

// Base interface for LLM runners
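For context, a minimal caller-side sketch of the two new fields, assuming GenerationConfig sits in the executorch::extension::llm namespace like the rest of this runner and that the numOfAddedBoSTokens / numOfAddedEoSTokens constants from constants.h are the intended defaults; the helper function and include paths are illustrative only.

#include "constants.h"
#include "irunner.h"

// Hypothetical helper: seed the new num_bos / num_eos fields from the
// constants defined in constants.h; callers can still override them.
executorch::extension::llm::GenerationConfig makeDefaultConfig() {
  executorch::extension::llm::GenerationConfig config;
  config.enable_kv_cache = true; // existing field shown in this hunk
  config.num_bos = executorch::extension::llm::numOfAddedBoSTokens;
  config.num_eos = executorch::extension::llm::numOfAddedEoSTokens;
  return config;
}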
@@ -15,8 +15,8 @@
#pragma once

// This list should be very conservative since most kernel .cpp files will
// include these and depend on their transitive deps. Only add a header if 99%
// of kernels would have included it anyway.
// include these and depend on their transitive deps. Only add a header if
// 99% of kernels would have included it anyway.
#include <executorch/runtime/core/exec_aten/exec_aten.h> // IWYU pragma: export
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h> // IWYU pragma: export
#include <executorch/runtime/core/exec_aten/util/tensor_util.h> // IWYU pragma: export
2 changes: 1 addition & 1 deletion packages/react-native-executorch/common/runner/runner.cpp
@@ -19,7 +19,7 @@

namespace example {

using namespace executorch::extension::constants;
using namespace executorch::extension::llm;
using ::executorch::extension::Module;
using ::executorch::runtime::Error;
using ::executorch::runtime::Result;
8 changes: 8 additions & 0 deletions packages/react-native-executorch/common/runner/sampler.h
@@ -26,6 +26,8 @@ namespace extension {
namespace llm {
// A simple llama2 sampler.

inline constexpr auto kTopp = 0.9f;

template <typename T> struct ProbIndex {
T prob;
int32_t index;
@@ -65,3 +67,9 @@ using ::executorch::extension::llm::ProbIndex;
using ::executorch::extension::llm::Sampler;
} // namespace executor
} // namespace torch

namespace executorch::llm {
// TODO(T197294990): Remove these deprecated aliases once all users have moved
// to the new `::executorch::extension::llm` namespaces.
using ::executorch::extension::llm::kTopp;
} // namespace executorch::llm
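A usage sketch for the new kTopp constant, assuming the Sampler keeps its llama2-style constructor (vocab_size, temperature, topp, rng_seed); the vocabulary size, temperature, and seed are placeholders.

#include "sampler.h"

// Sketch only: construct the sampler with the shared top-p constant instead
// of a hard-coded 0.9f at each call site (constructor signature assumed).
void sampler_example() {
  executorch::extension::llm::Sampler sampler(
      /*vocab_size=*/32000,
      /*temperature=*/0.8f,
      /*topp=*/executorch::extension::llm::kTopp,
      /*rng_seed=*/42);
  (void)sampler;
}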
61 changes: 59 additions & 2 deletions packages/react-native-executorch/common/runner/stats.h
@@ -11,6 +11,7 @@
#include "util.h"
#include <cinttypes>
#include <executorch/runtime/platform/log.h>
#include <limits>
#include <sstream>
#include <string>

@@ -44,11 +45,19 @@ struct Stats {
// inference_end_ms: End of inference/generation.
long inference_end_ms;
// Keep a running total of the time spent in sampling.
long aggregate_sampling_time_ms;
long aggregate_sampling_time_ms = 0;
// Token count from prompt
int64_t num_prompt_tokens;
// Token count from generated (total - prompt)
int64_t num_generated_tokens;
// GPU memory stats (optional; may be zero if not available)
// GPU memory stats (optional). Use sentinel UINT64_MAX / -1.0 to indicate
// "not available".
uint64_t gpu_total_bytes = std::numeric_limits<uint64_t>::max();
uint64_t gpu_free_before_load_bytes = std::numeric_limits<uint64_t>::max();
uint64_t gpu_free_after_load_bytes = std::numeric_limits<uint64_t>::max();
uint64_t gpu_free_after_generate_bytes = std::numeric_limits<uint64_t>::max();
double gpu_peak_usage_mb = -1.0;
inline void on_sampling_begin() {
aggregate_sampling_timer_start_timestamp = time_in_ms();
}
@@ -75,6 +84,11 @@ struct Stats {
aggregate_sampling_time_ms = 0;
num_prompt_tokens = 0;
num_generated_tokens = 0;
gpu_total_bytes = std::numeric_limits<uint64_t>::max();
gpu_free_before_load_bytes = std::numeric_limits<uint64_t>::max();
gpu_free_after_load_bytes = std::numeric_limits<uint64_t>::max();
gpu_free_after_generate_bytes = std::numeric_limits<uint64_t>::max();
gpu_peak_usage_mb = -1.0;
aggregate_sampling_timer_start_timestamp = 0;
}

@@ -93,7 +107,29 @@ inline std::string stats_to_json_string(const Stats &stats) {
<< "\"prompt_eval_end_ms\":" << stats.prompt_eval_end_ms << ","
<< "\"first_token_ms\":" << stats.first_token_ms << ","
<< "\"aggregate_sampling_time_ms\":" << stats.aggregate_sampling_time_ms
<< "," << "\"SCALING_FACTOR_UNITS_PER_SECOND\":"
<< ",";
// Only include GPU fields in the JSON if gpu_total_bytes is valid (not
// equal to sentinel -1)
if (stats.gpu_total_bytes != static_cast<uint64_t>(-1)) {
ss << "\"gpu_total_bytes\":" << stats.gpu_total_bytes;
if (stats.gpu_free_before_load_bytes != static_cast<uint64_t>(-1)) {
ss << ",\"gpu_free_before_load_bytes\":"
<< stats.gpu_free_before_load_bytes;
}
if (stats.gpu_free_after_load_bytes != static_cast<uint64_t>(-1)) {
ss << ",\"gpu_free_after_load_bytes\":"
<< stats.gpu_free_after_load_bytes;
}
if (stats.gpu_free_after_generate_bytes != static_cast<uint64_t>(-1)) {
ss << ",\"gpu_free_after_generate_bytes\":"
<< stats.gpu_free_after_generate_bytes;
}
if (stats.gpu_peak_usage_mb >= 0.0) {
ss << ",\"gpu_peak_usage_mb\":" << stats.gpu_peak_usage_mb;
}
ss << ",";
}
ss << "\"SCALING_FACTOR_UNITS_PER_SECOND\":"
<< stats.SCALING_FACTOR_UNITS_PER_SECOND << "}";
return ss.str();
}
@@ -145,6 +181,27 @@ inline void print_report(const Stats &stats) {
stats.num_prompt_tokens + stats.num_generated_tokens,
(double)stats.aggregate_sampling_time_ms /
stats.SCALING_FACTOR_UNITS_PER_SECOND);

// GPU memory reporting (only meaningful if GPU fields were populated)
if (stats.gpu_total_bytes != static_cast<uint64_t>(-1)) {
ET_LOG(Info, "\tGPU total memory: %.2f MB",
stats.gpu_total_bytes / 1024.0 / 1024.0);
if (stats.gpu_free_before_load_bytes != static_cast<uint64_t>(-1)) {
ET_LOG(Info, "\tGPU free before load: %.2f MB",
stats.gpu_free_before_load_bytes / 1024.0 / 1024.0);
}
if (stats.gpu_free_after_load_bytes != static_cast<uint64_t>(-1)) {
ET_LOG(Info, "\tGPU free after load: %.2f MB",
stats.gpu_free_after_load_bytes / 1024.0 / 1024.0);
}
if (stats.gpu_free_after_generate_bytes != static_cast<uint64_t>(-1)) {
ET_LOG(Info, "\tGPU free after generate: %.2f MB",
stats.gpu_free_after_generate_bytes / 1024.0 / 1024.0);
}
if (stats.gpu_peak_usage_mb >= 0.0) {
ET_LOG(Info, "\tGPU peak usage: %.2f MB", stats.gpu_peak_usage_mb);
}
}
}

} // namespace llm
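To make the sentinel behavior concrete, a small sketch (byte counts invented, namespace assumed to match the rest of this runner): a Stats left at its defaults emits no gpu_* keys, and each key appears only once its field is moved off the sentinel.

#include "stats.h"
#include <string>

// Sketch: GPU keys are emitted only for fields moved off their sentinels.
void stats_json_example() {
  executorch::extension::llm::Stats stats{};
  stats.num_prompt_tokens = 16;
  stats.num_generated_tokens = 32;
  // All GPU fields still at their sentinels -> JSON contains no "gpu_*" keys.

  stats.gpu_total_bytes = 8ull * 1024 * 1024 * 1024;            // e.g. 8 GB
  stats.gpu_free_before_load_bytes = 6ull * 1024 * 1024 * 1024; // e.g. 6 GB
  stats.gpu_peak_usage_mb = 1536.0;
  // Now "gpu_total_bytes", "gpu_free_before_load_bytes" and
  // "gpu_peak_usage_mb" show up; the unset "after load/generate" keys do not.
  std::string json = executorch::extension::llm::stats_to_json_string(stats);
  (void)json;
}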
@@ -32,15 +32,23 @@ TextDecoderRunner::TextDecoderRunner(Module *module, IOManager *io_manager,
::executorch::runtime::Result<executorch::aten::Tensor>
TextDecoderRunner::step(TensorPtr &tokens, int64_t start_pos) {
// ET_LOG(Info, "Input token %" PRIu64, input_token);
auto method_meta = ET_UNWRAP(module_->method_meta("forward"));
auto method_meta_result = module_->method_meta("forward");
if (!method_meta_result.ok()) {
return method_meta_result.error();
}
auto method_meta = std::move(*method_meta_result);
// If only 1 input, we are not using kv cache
bool use_kv_cache = method_meta.num_inputs() > 1;

std::vector<int64_t> cache_positions;

if (use_kv_cache) {
auto start_pos_tensor = ET_UNWRAP(populate_start_pos_or_cache_position(
module_, start_pos, cache_positions, tokens->numel(), "forward"));
auto start_pos_tensor_result = populate_start_pos_or_cache_position(
module_, start_pos, cache_positions, tokens->numel(), "forward");
if (!start_pos_tensor_result.ok()) {
return start_pos_tensor_result.error();
}
auto start_pos_tensor = std::move(*start_pos_tensor_result);

std::vector<runtime::EValue> inputs;
auto inputs_res = io_manager_->prepare_decode(tokens, start_pos_tensor);
@@ -32,7 +32,7 @@
}

// Check if we need to chunk the prompt tokens
int32_t num_prompt_tokens = prompt_tokens.size();

[warning] GitHub Actions / build, line 35 in packages/react-native-executorch/common/runner/text_prefiller.cpp: implicit conversion loses integer precision: 'size_type' (aka 'unsigned long') to 'int32_t' (aka 'int') [-Wshorten-64-to-32]
// If prompt tokens exceed max_seq_len_, we need to chunk them
if (num_prompt_tokens > max_seq_len_) {
@@ -41,7 +41,7 @@

while (num_tokens_to_process < num_prompt_tokens) {
auto num_tokens_to_prefill_with = std::min<int>(
num_prompt_tokens - num_tokens_to_process, max_seq_len_);

[warning] GitHub Actions / build, line 44 in packages/react-native-executorch/common/runner/text_prefiller.cpp: implicit conversion loses integer precision: 'int64_t' (aka 'long long') to 'const int' [-Wshorten-64-to-32]

std::vector<uint64_t> prompt_tokens_to_process(
num_tokens_to_prefill_with);
@@ -98,8 +98,11 @@

// run the first token and get back logits tensor. Assuming the first token
// is bos so don't callback.
auto logits_tensor =
ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos));
auto logits_result = text_decoder_runner_->step(tokens, start_pos);
if (!logits_result.ok()) {
return logits_result.error();
}
auto logits_tensor = std::move(*logits_result);

pos += 1; // start the loop from index 1
start_pos += 1;
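The two build warnings above flag implicit 64-bit-to-32-bit narrowing. For illustration only (not part of this change), the explicit casts the warning asks for would look roughly like this, with stand-in variables mirroring the flagged lines:

#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative sketch of silencing -Wshorten-64-to-32 by making the
// narrowing explicit; variable names mirror the flagged lines above.
void narrowing_example(const std::vector<uint64_t> &prompt_tokens,
                       int64_t max_seq_len, int32_t num_tokens_to_process) {
  const auto num_prompt_tokens =
      static_cast<int32_t>(prompt_tokens.size()); // size_t -> int32_t
  const auto num_tokens_to_prefill_with =
      std::min<int>(num_prompt_tokens - num_tokens_to_process,
                    static_cast<int>(max_seq_len)); // int64_t -> int
  (void)num_tokens_to_prefill_with;
}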
@@ -39,6 +39,10 @@ class TextTokenGenerator {
* @param start_pos The start position of the new tokens, based on how many
* prompt tokens are prefilled.
* @param max_new_tokens Maximum number of new tokens to generate.
* @param temperature controls the randomness of predictions by scaling the
* logits before applying softmax. A higher temperature results in more
* random predictions, while a lower temperature results in more deterministic
* predictions.
* @param token_callback what to do after a token is generated.
* @return how many tokens are generated.
*/
@@ -113,7 +117,8 @@ class TextTokenGenerator {
// We pass false, as we don't want to skip special tokens, e.g.
// <think>

auto decodeResult = tokenizer_->decode(token_cache, false);
auto decodeResult =
tokenizer_->decode(token_cache, false); // NOTE: difference
if (!decodeResult.ok()) {
throw rnexecutorch::RnExecutorchError(
rnexecutorch::RnExecutorchErrorCode::TokenizerError,
39 changes: 37 additions & 2 deletions packages/react-native-executorch/common/runner/util.h
@@ -10,10 +10,12 @@
#include "constants.h"
#include "text_prefiller.h"
#include <cctype>
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>
#include <executorch/runtime/platform/compiler.h>
#include <stdio.h>
#include <time.h>
#include <vector>
#if defined(__linux__) || defined(__ANDROID__) || defined(__unix__)
#include <sys/resource.h>
#endif
@@ -112,8 +114,16 @@ populate_start_pos_or_cache_position(Module *module, int64_t &start_pos,
const char *method_name = "forward") {
// Get expected shape of cache position tensor, which should be the second
// argument
auto method_meta = ET_UNWRAP(module->method_meta(method_name));
auto second_input_info = ET_UNWRAP(method_meta.input_tensor_meta(1));
auto method_meta_result = module->method_meta(method_name);
if (!method_meta_result.ok()) {
return method_meta_result.error();
}
auto method_meta = std::move(*method_meta_result);
auto second_input_info_result = method_meta.input_tensor_meta(1);
if (!second_input_info_result.ok()) {
return second_input_info_result.error();
}
auto second_input_info = std::move(*second_input_info_result);
auto second_input_sizes = second_input_info.sizes();
auto numel = second_input_sizes[0];

@@ -136,6 +146,31 @@
}
}

/**
* Helper function to convert a float tensor to bfloat16.
* Creates a new tensor with bfloat16 dtype and copies/converts the data.
*/
inline ::executorch::runtime::Result<::executorch::extension::TensorPtr>
convert_to_bfloat16(const ::executorch::extension::TensorPtr &src_tensor) {
ET_CHECK_OR_RETURN_ERROR(
src_tensor->scalar_type() == ::executorch::aten::ScalarType::Float,
InvalidArgument,
"BFloat16 conversion only supported from Float source data");

const auto num_elements = static_cast<size_t>(src_tensor->numel());
const float *float_data = src_tensor->const_data_ptr<float>();

auto bf16_tensor = ::executorch::extension::empty_like(
src_tensor, ::executorch::aten::ScalarType::BFloat16);
auto *bf16_data =
bf16_tensor->mutable_data_ptr<::executorch::aten::BFloat16>();
for (size_t i = 0; i < num_elements; ++i) {
bf16_data[i] = ::executorch::aten::BFloat16(float_data[i]);
}

return bf16_tensor;
}

} // namespace llm
} // namespace extension
} // namespace executorch
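A hedged usage sketch for the new convert_to_bfloat16 helper; the make_tensor_ptr overload, the include paths, and the values are assumptions, not part of this PR.

#include "util.h"
#include <executorch/extension/tensor/tensor.h>
#include <utility>
#include <vector>

// Sketch: build a small float tensor, convert it, and handle the error path.
void bfloat16_example() {
  // make_tensor_ptr overload assumed: a 1-D float tensor from a vector.
  auto float_tensor = ::executorch::extension::make_tensor_ptr(
      std::vector<float>{1.0f, 2.0f, 3.0f, 4.0f});
  auto bf16_result =
      ::executorch::extension::llm::convert_to_bfloat16(float_tensor);
  if (!bf16_result.ok()) {
    return; // helper only accepts Float sources, per its ET_CHECK above
  }
  auto bf16_tensor = std::move(*bf16_result);
  // bf16_tensor now reports ScalarType::BFloat16 with converted copies.
  (void)bf16_tensor;
}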