Skip to content

Commit 3ae5e04

Browse files
author
Chris Warren-Smith
committed
LLM: plugin module - initial commit
1 parent 9e2c60e commit 3ae5e04

5 files changed

Lines changed: 170 additions & 53 deletions

File tree

llama/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ set(GGML_BUILD_TESTS OFF CACHE BOOL "" FORCE)
3333
set(GGML_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
3434

3535
# CPU-only flags
36+
set(GGML_OPENMP OFF CACHE BOOL "" FORCE)
3637
set(GGML_CUDA OFF CACHE BOOL "" FORCE)
3738
set(GGML_METAL OFF CACHE BOOL "" FORCE)
3839
set(GGML_OPENCL OFF CACHE BOOL "" FORCE)
@@ -114,6 +115,9 @@ set_target_properties(llm_test PROPERTIES
114115
# Android native library
115116
# ------------------------------------------------------------------
116117
if (ANDROID)
118+
set(GGML_LLAMAFILE OFF CACHE BOOL "" FORCE)
119+
set(GGML_BLAS OFF CACHE BOOL "" FORCE)
120+
117121
# CMake sets ANDROID when using the Android toolchain
118122
# Re‑use the same source files for the Android .so
119123
add_library(llm_android SHARED

llama/llama-sb.cpp

Lines changed: 35 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,17 @@ Llama::Llama() :
1515
_sampler(nullptr),
1616
_vocab(nullptr),
1717
_temperature(0),
18-
_n_ctx(0) {
18+
_top_k(0),
19+
_top_p(1.0f),
20+
_min_p(0.0f),
21+
_max_tokens(150),
22+
_log_level(GGML_LOG_LEVEL_NONE) {
23+
llama_log_set([](enum ggml_log_level level, const char * text, void *user_data) {
24+
Llama *llama = (Llama *)user_data;
25+
if (level > llama->_log_level) {
26+
fprintf(stderr, "LLAMA: %s", text);
27+
}
28+
}, this);
1929
}
2030

2131
Llama::~Llama() {
@@ -42,62 +52,54 @@ const string Llama::build_chat_prompt(const string &user_msg) {
4252
return _chat_prompt;
4353
}
4454

45-
bool Llama::construct(string model_path, int n_ctx, bool disable_log) {
46-
if (disable_log) {
47-
// only print errors
48-
llama_log_set([](enum ggml_log_level level, const char * text, void * /* user_data */) {
49-
if (level >= GGML_LOG_LEVEL_ERROR && text[0] != '.' && text[0] != '\n') {
50-
fprintf(stderr, "%s", text);
51-
}
52-
}, nullptr);
53-
}
54-
55+
bool Llama::construct(string model_path, int n_ctx, int n_batch) {
5556
ggml_backend_load_all();
5657

5758
llama_model_params mparams = llama_model_default_params();
58-
mparams.n_gpu_layers = 99;
59+
mparams.n_gpu_layers = 0;
5960

6061
_model = llama_model_load_from_file(model_path.c_str(), mparams);
6162
if (!_model) {
6263
_last_error = "failed to load model";
6364
} else {
6465
llama_context_params cparams = llama_context_default_params();
6566
cparams.n_ctx = n_ctx;
66-
cparams.n_batch = n_ctx;
67+
cparams.n_batch = n_batch;
6768
cparams.no_perf = true;
68-
6969
_ctx = llama_init_from_model(_model, cparams);
7070
if (!_ctx) {
7171
_last_error = "failed to create context";
7272
} else {
7373
_vocab = llama_model_get_vocab(_model);
74+
75+
auto sparams = llama_sampler_chain_default_params();
76+
sparams.no_perf = false;
77+
_sampler = llama_sampler_chain_init(sparams);
7478
}
7579
}
7680
return _last_error.empty();
7781
}
7882

79-
void Llama::configure_sampler(float temperature) {
80-
if (temperature != _temperature || _sampler == nullptr) {
81-
if (_sampler != nullptr) {
82-
llama_sampler_free(_sampler);
83+
void Llama::configure_sampler() {
84+
llama_sampler_reset(_sampler);
85+
if (_temperature <= 0.0f) {
86+
llama_sampler_chain_add(_sampler, llama_sampler_init_greedy());
87+
} else {
88+
llama_sampler_chain_add(_sampler, llama_sampler_init_temp(_temperature));
89+
if (_top_k > 0) {
90+
llama_sampler_chain_add(_sampler, llama_sampler_init_top_k(_top_k));
8391
}
84-
auto sparams = llama_sampler_chain_default_params();
85-
sparams.no_perf = false;
86-
_sampler = llama_sampler_chain_init(sparams);
87-
_temperature = temperature;
88-
89-
// llama_sampler_chain_reset(sampler);
90-
if (temperature <= 0.0f) {
91-
llama_sampler_chain_add(_sampler, llama_sampler_init_greedy());
92-
} else {
93-
llama_sampler_chain_add(_sampler, llama_sampler_init_min_p(0.05f, 1));
94-
llama_sampler_chain_add(_sampler, llama_sampler_init_temp(temperature));
95-
llama_sampler_chain_add(_sampler, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
92+
if (_top_p < 1.0f) {
93+
llama_sampler_chain_add(_sampler, llama_sampler_init_top_p(_top_p, 1));
94+
}
95+
if (_min_p > 0.0f) {
96+
llama_sampler_chain_add(_sampler, llama_sampler_init_min_p(_min_p, 1));
9697
}
98+
llama_sampler_chain_add(_sampler, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
9799
}
98100
}
99101

100-
string Llama::generate(const string &prompt, int max_tokens, float temperature) {
102+
string Llama::generate(const string &prompt) {
101103
string out;
102104

103105
// find the number of tokens in the prompt
@@ -111,7 +113,7 @@ string Llama::generate(const string &prompt, int max_tokens, float temperature)
111113
}
112114

113115
// initialize the sampler
114-
configure_sampler(temperature);
116+
configure_sampler();
115117

116118
// prepare a batch for the prompt
117119
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
@@ -129,7 +131,7 @@ string Llama::generate(const string &prompt, int max_tokens, float temperature)
129131
batch = llama_batch_get_one(&decoder_start_token_id, 1);
130132
}
131133

132-
for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + max_tokens;) {
134+
for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + _max_tokens;) {
133135
// evaluate the current batch with the transformer model
134136
if (llama_decode(_ctx, batch)) {
135137
_last_error = "failed to eval";

llama/llama-sb.h

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,32 @@ struct Llama {
1616
explicit Llama();
1717
~Llama();
1818

19+
// init
20+
bool construct(string model_path, int n_ctx, int n_batch);
21+
22+
// generation
23+
string generate(const string &prompt);
24+
25+
// generation parameters
26+
void set_max_tokens(int max_tokens) { _max_tokens = max_tokens; }
27+
void set_min_p(float min_p) { _min_p = min_p; }
28+
void set_temperature(float temperature) { _temperature = temperature; }
29+
void set_top_k(int top_k) { _top_k = top_k; }
30+
void set_top_p(float top_p) { _top_p = top_p; }
31+
32+
// messages
1933
void append_response(const string &response);
34+
void append_user_message(const string &user_msg);
35+
const string& get_chat_history() const;
2036
const string build_chat_prompt(const string &user_msg);
21-
bool construct(string model_path, int n_ctx, bool disable_log);
22-
string generate(const string &prompt, int max_tokens, float temperature);
37+
38+
// error handling
2339
const char *last_error() { return _last_error.c_str(); }
40+
void set_log_level(int level) { _log_level = level; }
2441
void reset();
2542

2643
private:
27-
void configure_sampler(float temperature);
44+
void configure_sampler();
2845

2946
llama_model *_model;
3047
llama_context *_ctx;
@@ -33,5 +50,9 @@ struct Llama {
3350
string _chat_prompt;
3451
string _last_error;
3552
float _temperature;
36-
int _n_ctx;
53+
float _top_p;
54+
float _min_p;
55+
int _top_k;
56+
int _max_tokens;
57+
int _log_level;
3758
};

llama/main.cpp

Lines changed: 104 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -50,26 +50,111 @@ static string expand_path(const char *path) {
5050
return result;
5151
}
5252

53+
//
54+
// llama.set_max_tokens(50)
55+
//
56+
static int cmd_llama_set_max_tokens(var_s *self, int argc, slib_par_t *arg, var_s *retval) {
57+
int result = 0;
58+
if (argc != 1) {
59+
error(retval, "llama.set_max_tokens", 1, 1);
60+
} else {
61+
int id = get_class_id(self, retval);
62+
if (id != -1) {
63+
Llama &llama = g_map.at(id);
64+
llama.set_max_tokens(get_param_int(argc, arg, 0, 0));
65+
result = 1;
66+
}
67+
}
68+
return result;
69+
}
70+
71+
//
72+
// llama.set_min_p(0.5)
73+
//
74+
static int cmd_llama_set_min_p(var_s *self, int argc, slib_par_t *arg, var_s *retval) {
75+
int result = 0;
76+
if (argc != 1) {
77+
error(retval, "llama.set_min_p", 1, 1);
78+
} else {
79+
int id = get_class_id(self, retval);
80+
if (id != -1) {
81+
Llama &llama = g_map.at(id);
82+
llama.set_min_p(get_param_num(argc, arg, 0, 0));
83+
result = 1;
84+
}
85+
}
86+
return result;
87+
}
88+
89+
//
90+
// llama.set_temperature(0.8)
91+
//
92+
static int cmd_llama_set_temperature(var_s *self, int argc, slib_par_t *arg, var_s *retval) {
93+
int result = 0;
94+
if (argc != 1) {
95+
error(retval, "llama.set_temperature", 1, 1);
96+
} else {
97+
int id = get_class_id(self, retval);
98+
if (id != -1) {
99+
Llama &llama = g_map.at(id);
100+
llama.set_temperature(get_param_num(argc, arg, 0, 0));
101+
result = 1;
102+
}
103+
}
104+
return result;
105+
}
106+
107+
//
108+
// llama.set_top_k(10)
109+
//
110+
static int cmd_llama_set_top_k(var_s *self, int argc, slib_par_t *arg, var_s *retval) {
111+
int result = 0;
112+
if (argc != 1) {
113+
error(retval, "llama.set_top_k", 1, 1);
114+
} else {
115+
int id = get_class_id(self, retval);
116+
if (id != -1) {
117+
Llama &llama = g_map.at(id);
118+
llama.set_top_k(get_param_int(argc, arg, 0, 0));
119+
result = 1;
120+
}
121+
}
122+
return result;
123+
}
124+
125+
static int cmd_llama_set_top_p(var_s *self, int argc, slib_par_t *arg, var_s *retval) {
126+
int result = 0;
127+
if (argc != 1) {
128+
error(retval, "llama.set_top_p", 1, 1);
129+
} else {
130+
int id = get_class_id(self, retval);
131+
if (id != -1) {
132+
Llama &llama = g_map.at(id);
133+
llama.set_top_p(get_param_num(argc, arg, 0, 0));
134+
result = 1;
135+
}
136+
}
137+
return result;
138+
}
139+
53140
//
54141
// print llama.chat("Hello")
55142
//
56143
static int cmd_llama_chat(var_s *self, int argc, slib_par_t *arg, var_s *retval) {
57144
int result = 0;
58-
if (argc < 1) {
59-
error(retval, "llama.chat", 1, 3);
145+
if (argc != 1) {
146+
error(retval, "llama.chat", 1, 1);
60147
} else {
61148
int id = get_class_id(self, retval);
62149
if (id != -1) {
63150
Llama &llama = g_map.at(id);
64151
auto prompt = get_param_str(argc, arg, 0, "");
65-
int max_tokens = get_param_int(argc, arg, 1, 32);
66-
var_num_t temperature = get_param_num(argc, arg, 2, 0.8f);
67152

68153
// build accumulated prompt
69154
string updated_prompt = llama.build_chat_prompt(prompt);
70155

71156
// run generation WITHOUT clearing cache
72-
string response = llama.generate(updated_prompt, max_tokens, temperature);
157+
string response = llama.generate(updated_prompt);
73158

74159
// append assistant reply to history
75160
llama.append_response(response);
@@ -100,20 +185,18 @@ static int cmd_llama_reset(var_s *self, int argc, slib_par_t *arg, var_s *retval
100185
}
101186

102187
//
103-
// print llama.generate("please generate as simple program in BASIC to draw a cat", 1024, 0.8)
188+
// print llama.generate("please generate a simple program in BASIC to draw a cat")
104189
//
105190
static int cmd_llama_generate(var_s *self, int argc, slib_par_t *arg, var_s *retval) {
106191
int result = 0;
107-
if (argc < 1) {
108-
error(retval, "llama.generate", 1, 3);
192+
if (argc != 1) {
193+
error(retval, "llama.generate", 1, 1);
109194
} else {
110195
int id = get_class_id(self, retval);
111196
if (id != -1) {
112197
Llama &llama = g_map.at(id);
113198
auto prompt = get_param_str(argc, arg, 0, "");
114-
int max_tokens = get_param_int(argc, arg, 1, 32);
115-
var_num_t temperature = get_param_num(argc, arg, 2, 0.8f);
116-
string response = llama.generate(prompt, max_tokens, temperature);
199+
string response = llama.generate(prompt);
117200
v_setstr(retval, response.c_str());
118201
result = 1;
119202
}
@@ -124,12 +207,19 @@ static int cmd_llama_generate(var_s *self, int argc, slib_par_t *arg, var_s *ret
124207
static int cmd_create_llama(int argc, slib_par_t *params, var_t *retval) {
125208
int result;
126209
auto model = expand_path(get_param_str(argc, params, 0, ""));
127-
int n_ctx = get_param_int(argc, params, 0, 2048);
128-
int disable_log = get_param_int(argc, params, 1, 1);
210+
auto n_ctx = get_param_int(argc, params, 0, 2048);
211+
auto n_batch = get_param_int(argc, params, 1, 1024);
212+
auto temperature = get_param_num(argc, params, 2, 0.25);
129213
int id = ++g_nextId;
130214
Llama &llama = g_map[id];
131-
if (llama.construct(model, n_ctx, disable_log)) {
215+
if (llama.construct(model, n_ctx, n_batch)) {
216+
llama.set_temperature(temperature);
132217
map_init_id(retval, id, CLASS_ID);
218+
v_create_callback(retval, "set_max_tokens", cmd_llama_set_max_tokens);
219+
v_create_callback(retval, "set_min_p", cmd_llama_set_min_p);
220+
v_create_callback(retval, "set_temperature", cmd_llama_set_temperature);
221+
v_create_callback(retval, "set_top_k", cmd_llama_set_top_k);
222+
v_create_callback(retval, "set_top_p", cmd_llama_set_top_p);
133223
v_create_callback(retval, "chat", cmd_llama_chat);
134224
v_create_callback(retval, "generate", cmd_llama_generate);
135225
v_create_callback(retval, "reset", cmd_llama_reset);

llama/test_main.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,8 @@ int main(int argc, char ** argv) {
5656
}
5757

5858
Llama llama;
59-
if (llama.construct(model_path, 1024, true)) {
60-
string out = llama. generate(prompt, n_predict, 0.8f);
59+
if (llama.construct(model_path, 1024, 1024)) {
60+
string out = llama.generate(prompt);
6161
printf("\033[33m");
6262
printf(out.c_str());
6363
printf("\n\033[0m");

0 commit comments

Comments
 (0)