diff --git a/app/MindWork AI Studio/Assistants/I18N/allTexts.lua b/app/MindWork AI Studio/Assistants/I18N/allTexts.lua index 6d4cc501c..40a682b2d 100644 --- a/app/MindWork AI Studio/Assistants/I18N/allTexts.lua +++ b/app/MindWork AI Studio/Assistants/I18N/allTexts.lua @@ -1819,6 +1819,9 @@ UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3403290862"] = "The selec -- Select a provider first UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3654197869"] = "Select a provider first" +-- Estimated amount of tokens: +UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T377990776"] = "Estimated amount of tokens:" + -- Start new chat in workspace '{0}' UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3928697643"] = "Start new chat in workspace '{0}'" @@ -3553,6 +3556,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2331453405"] = "(O -- Add UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2646845972"] = "Add" +-- Selected file path for the custom tokenizer +UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T278585345"] = "Selected file path for the custom tokenizer" + -- No models loaded or available. UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2810182573"] = "No models loaded or available." @@ -3562,6 +3568,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2842060373"] = "In -- Currently, we cannot query the embedding models for the selected provider and/or host. Therefore, please enter the model name manually. UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T290547799"] = "Currently, we cannot query the embedding models for the selected provider and/or host. Therefore, please enter the model name manually." 
+-- Choose a custom tokenizer here +UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T3787466119"] = "Choose a custom tokenizer here" + -- Model selection UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T416738168"] = "Model selection" @@ -5398,6 +5407,9 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1019424746"] = "Startup log file -- Browse AI Studio's source code on GitHub — we welcome your contributions. UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1107156991"] = "Browse AI Studio's source code on GitHub — we welcome your contributions." +-- The Tokenizer library serves as the base framework for integrating the DeepSeek tokenizer. +UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1132433749"] = "The Tokenizer library serves as the base framework for integrating the DeepSeek tokenizer." + -- ID mismatch: the plugin ID differs from the enterprise configuration ID. UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1137744461"] = "ID mismatch: the plugin ID differs from the enterprise configuration ID." @@ -5638,6 +5650,9 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T566998575"] = "This is a library -- Used .NET SDK UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T585329785"] = "Used .NET SDK" +-- We use the DeepSeek Tokenizer to estimate the number of tokens an input will generate. +UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T591393704"] = "We use the DeepSeek Tokenizer to estimate the number of tokens an input will generate." + -- This library is used to manage sidecar processes and to ensure that stale or zombie sidecars are detected and terminated. UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T633932150"] = "This library is used to manage sidecar processes and to ensure that stale or zombie sidecars are detected and terminated." 
@@ -6664,29 +6679,80 @@ UI_TEXT_CONTENT["AISTUDIO::TOOLS::RAG::RAGPROCESSES::AISRCSELWITHRETCTXVAL::T304 -- AI source selection with AI retrieval context validation UI_TEXT_CONTENT["AISTUDIO::TOOLS::RAG::RAGPROCESSES::AISRCSELWITHRETCTXVAL::T3775725978"] = "AI source selection with AI retrieval context validation" --- Executable Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T2217313358"] = "Executable Files" +-- Text +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1041509726"] = "Text" + +-- Office Files +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1063218378"] = "Office Files" + +-- Executable +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1364437037"] = "Executable" + +-- Mail +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1399880782"] = "Mail" + +-- Source like +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1487238587"] = "Source like" + +-- Image +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1494001562"] = "Image" + +-- Video +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1533528076"] = "Video" + +-- Source Code +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1569048941"] = "Source Code" + +-- Config +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1779622119"] = "Config" + +-- Audio +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T2291602489"] = "Audio" + +-- Custom +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T2502277006"] = "Custom" + +-- Media +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T3507473059"] = "Media" + +-- Source like prefix +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T378481461"] = "Source like prefix" + +-- Document +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T4165204724"] = "Document" + +-- Text +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1041509726"] = "Text" + +-- Office Files +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1063218378"] = "Office Files" + +-- Executable 
+UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1364437037"] = "Executable" + +-- Image +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1494001562"] = "Image" --- All Source Code Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T2460199369"] = "All Source Code Files" +-- Video +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1533528076"] = "Video" --- All Audio Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T2575722901"] = "All Audio Files" +-- Source Code +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1569048941"] = "Source Code" --- All Video Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T2850789856"] = "All Video Files" +-- Config +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1779622119"] = "Config" --- PDF Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T3108466742"] = "PDF Files" +-- Audio +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T2291602489"] = "Audio" --- All Image Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T4086723714"] = "All Image Files" +-- Custom +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T2502277006"] = "Custom" --- Text Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T639143005"] = "Text Files" +-- Media +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T3507473059"] = "Media" --- All Office Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T709668067"] = "All Office Files" +-- Document +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T4165204724"] = "Document" -- Pandoc Installation UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::PANDOCAVAILABILITYSERVICE::T185447014"] = "Pandoc Installation" diff --git a/app/MindWork AI Studio/Chat/FileAttachment.cs b/app/MindWork AI Studio/Chat/FileAttachment.cs index f364ed8fa..1ce4f1d98 100644 --- a/app/MindWork AI Studio/Chat/FileAttachment.cs +++ b/app/MindWork AI Studio/Chat/FileAttachment.cs @@ -58,11 +58,14 @@ public record 
FileAttachment(FileAttachmentType Type, string FileName, string Fi /// extracting the filename, and reading the file size. /// /// The full path to the file. + /// Optional: The allowed file types. /// A FileAttachment instance with populated properties. - public static FileAttachment FromPath(string filePath) + public static FileAttachment FromPath(string filePath, FileType[]? allowedTypes=null) { var fileName = Path.GetFileName(filePath); var fileSize = File.Exists(filePath) ? new FileInfo(filePath).Length : 0; + if (allowedTypes != null && !IsAllowed(filePath, allowedTypes)) + return new FileAttachment(FileAttachmentType.FORBIDDEN, fileName, filePath, fileSize); var type = DetermineFileType(filePath); return type switch @@ -76,34 +79,24 @@ public static FileAttachment FromPath(string filePath) /// /// Determines the file attachment type based on the file extension. - /// Uses centrally defined file type filters from . + /// Uses centrally defined file type filters from . /// /// The file path to analyze. /// The corresponding FileAttachmentType. 
private static FileAttachmentType DetermineFileType(string filePath) { - var extension = Path.GetExtension(filePath).TrimStart('.').ToLowerInvariant(); - - if (FileTypeFilter.Executables.FilterExtensions.Contains(extension)) + if (FileTypes.IsAllowedPath(filePath, FileTypes.EXECUTABLES)) return FileAttachmentType.FORBIDDEN; - // Check if it's an image file: - if (FileTypeFilter.AllImages.FilterExtensions.Contains(extension)) + if (FileTypes.IsAllowedPath(filePath, FileTypes.IMAGE)) return FileAttachmentType.IMAGE; + - // Check if it's an audio file: - if (FileTypeFilter.AllAudio.FilterExtensions.Contains(extension)) + if (FileTypes.IsAllowedPath(filePath, FileTypes.AUDIO)) return FileAttachmentType.AUDIO; - // Check if it's an allowed document file (PDF, Text, or Office): - if (FileTypeFilter.PDF.FilterExtensions.Contains(extension) || - FileTypeFilter.Text.FilterExtensions.Contains(extension) || - FileTypeFilter.AllOffice.FilterExtensions.Contains(extension) || - FileTypeFilter.AllSourceCode.FilterExtensions.Contains(extension) || - FileTypeFilter.IsAllowedSourceLikeFileName(filePath)) - return FileAttachmentType.DOCUMENT; - - // All other file types are forbidden: - return FileAttachmentType.FORBIDDEN; + return FileTypes.IsAllowedPath(filePath, FileTypes.DOCUMENT) + ? FileAttachmentType.DOCUMENT + : FileAttachmentType.FORBIDDEN; } -} \ No newline at end of file +} diff --git a/app/MindWork AI Studio/Components/AttachDocuments.razor.cs b/app/MindWork AI Studio/Components/AttachDocuments.razor.cs index acfc0dd2f..0608125a7 100644 --- a/app/MindWork AI Studio/Components/AttachDocuments.razor.cs +++ b/app/MindWork AI Studio/Components/AttachDocuments.razor.cs @@ -48,6 +48,9 @@ public partial class AttachDocuments : MSGComponentBase [Parameter] public bool UseSmallForm { get; set; } + [Parameter] + public FileType[]? AllowedFileTypes { get; set; } + /// /// When true, validate media file types before attaching. Default is true.
That means that /// the user cannot attach unsupported media file types when the provider or model does not @@ -181,7 +184,6 @@ protected override async Task OnInitializedAsync() { if(!await FileExtensionValidation.IsExtensionValidWithNotifyAsync(FileExtensionValidation.UseCase.ATTACHING_CONTENT, path, this.ValidateMediaFileTypes, this.Provider)) continue; - this.DocumentPaths.Add(FileAttachment.FromPath(path)); } @@ -226,7 +228,7 @@ private async Task AddFilesManually() if (!await FileExtensionValidation.IsExtensionValidWithNotifyAsync(FileExtensionValidation.UseCase.ATTACHING_CONTENT, selectedFilePath, this.ValidateMediaFileTypes, this.Provider)) continue; - this.DocumentPaths.Add(FileAttachment.FromPath(selectedFilePath)); + this.DocumentPaths.Add(FileAttachment.FromPath(selectedFilePath, this.AllowedFileTypes)); } await this.DocumentPathsChanged.InvokeAsync(this.DocumentPaths); diff --git a/app/MindWork AI Studio/Components/ChatComponent.razor b/app/MindWork AI Studio/Components/ChatComponent.razor index 20bb5ec47..ced007115 100644 --- a/app/MindWork AI Studio/Components/ChatComponent.razor +++ b/app/MindWork AI Studio/Components/ChatComponent.razor @@ -34,7 +34,7 @@ - diff --git a/app/MindWork AI Studio/Components/ChatComponent.razor.cs b/app/MindWork AI Studio/Components/ChatComponent.razor.cs index f734d620f..9bfa27bda 100644 --- a/app/MindWork AI Studio/Components/ChatComponent.razor.cs +++ b/app/MindWork AI Studio/Components/ChatComponent.razor.cs @@ -3,6 +3,7 @@ using AIStudio.Provider; using AIStudio.Settings; using AIStudio.Settings.DataModel; +using AIStudio.Tools.Services; using Microsoft.AspNetCore.Components; using Microsoft.AspNetCore.Components.Web; @@ -44,6 +45,8 @@ public partial class ChatComponent : MSGComponentBase, IAsyncDisposable [Inject] private IDialogService DialogService { get; init; } = null!; + [Inject] + private RustService RustService { get; init; } = null!; [Inject] private IJSRuntime JsRuntime { get; init; } = null!; @@ -69,10 
+72,12 @@ public partial class ChatComponent : MSGComponentBase, IAsyncDisposable private Guid currentChatThreadId = Guid.Empty; private CancellationTokenSource? cancellationTokenSource; private HashSet chatDocumentPaths = []; + private string tokenCount = "0"; + private string TokenCountMessage => $"{this.T("Estimated amount of tokens:")} {this.tokenCount}"; // Unfortunately, we need the input field reference to blur the focus away. Without // this, we cannot clear the input field. - private MudTextField inputField = null!; + private UserPromptComponent inputField = null!; #region Overrides of ComponentBase @@ -460,6 +465,9 @@ private async Task InputKeyEvent(KeyboardEventArgs keyEvent) // Was a modifier key pressed as well? var isModifier = keyEvent.AltKey || keyEvent.CtrlKey || keyEvent.MetaKey || keyEvent.ShiftKey; + if (isEnter) + await this.CalculateTokenCount(); + // Depending on the user's settings, might react to shortcuts: switch (this.SettingsManager.ConfigurationData.Chat.ShortcutSendBehavior) { @@ -591,6 +599,7 @@ private async Task SendMessage(bool reuseLastUserPrompt = false) this.chatDocumentPaths.Clear(); await this.inputField.BlurAsync(); + this.tokenCount = "0"; // Enable the stream state for the chat component: this.isStreaming = true; @@ -973,6 +982,20 @@ private Task EditLastBlock(IContent block) return Task.CompletedTask; } + private async Task CalculateTokenCount() + { + if (this.inputField.Value is null) + { + this.tokenCount = "0"; + return; + } + var response = await this.RustService.GetTokenCount(this.inputField.Value); + if (response is null) + return; + this.tokenCount = response.TokenCount.ToString(); + this.StateHasChanged(); + } + #region Overrides of MSGComponentBase protected override async Task ProcessIncomingMessage(ComponentBase? sendingComponent, Event triggeredEvent, T? 
data) where T : default diff --git a/app/MindWork AI Studio/Components/SelectFile.razor b/app/MindWork AI Studio/Components/SelectFile.razor index de3971e52..561b11c0d 100644 --- a/app/MindWork AI Studio/Components/SelectFile.razor +++ b/app/MindWork AI Studio/Components/SelectFile.razor @@ -11,6 +11,7 @@ AdornmentIcon="@Icons.Material.Filled.AttachFile" UserAttributes="@SPELLCHECK_ATTRIBUTES" Variant="Variant.Outlined" + Clearable="this.IsClearable" /> diff --git a/app/MindWork AI Studio/Components/SelectFile.razor.cs b/app/MindWork AI Studio/Components/SelectFile.razor.cs index 9caf3cd77..06826ca29 100644 --- a/app/MindWork AI Studio/Components/SelectFile.razor.cs +++ b/app/MindWork AI Studio/Components/SelectFile.razor.cs @@ -23,16 +23,19 @@ public partial class SelectFile : MSGComponentBase public string FileDialogTitle { get; set; } = "Select File"; [Parameter] - public FileTypeFilter? Filter { get; set; } + public FileTypeFilter[]? Filter { get; set; } [Parameter] public Func Validation { get; set; } = _ => null; + + [Parameter] + public bool IsClearable { get; set; } = false; [Inject] public RustService RustService { get; set; } = null!; [Inject] - protected ILogger Logger { get; init; } = null!; + protected ILogger Logger { get; init; } = null!; private static readonly Dictionary SPELLCHECK_ATTRIBUTES = new(); diff --git a/app/MindWork AI Studio/Components/UserPromptComponent.cs b/app/MindWork AI Studio/Components/UserPromptComponent.cs new file mode 100644 index 000000000..03139a525 --- /dev/null +++ b/app/MindWork AI Studio/Components/UserPromptComponent.cs @@ -0,0 +1,68 @@ +using Microsoft.AspNetCore.Components; +using Timer = System.Timers.Timer; + +namespace AIStudio.Components; + +/// +/// Debounced multi-line text input built on . +/// Keeps the base API while adding a debounce timer. +/// Callers can override any property as usual. 
+/// +public class UserPromptComponent : MudTextField +{ + [Parameter] + public TimeSpan DebounceTime { get; set; } = TimeSpan.FromMilliseconds(800); + + [Parameter] + public Func WhenTextChangedAsync { get; set; } = _ => Task.CompletedTask; + + private readonly Timer debounceTimer = new(); + private string text = string.Empty; + private string lastParameterText = string.Empty; + private string lastNotifiedText = string.Empty; + private bool isInitialized; + + protected override async Task OnInitializedAsync() + { + this.text = this.Text ?? string.Empty; + this.lastParameterText = this.text; + this.lastNotifiedText = this.text; + this.debounceTimer.AutoReset = false; + this.debounceTimer.Interval = this.DebounceTime.TotalMilliseconds; + this.debounceTimer.Elapsed += (_, _) => + { + this.debounceTimer.Stop(); + if (this.text == this.lastNotifiedText) + return; + + this.lastNotifiedText = this.text; + this.InvokeAsync(async () => await this.TextChanged.InvokeAsync(this.text)); + this.InvokeAsync(async () => await this.WhenTextChangedAsync(this.text)); + }; + + this.isInitialized = true; + await base.OnInitializedAsync(); + } + + protected override async Task OnParametersSetAsync() + { + // Ensure the timer uses the latest debouncing interval: + if (!this.isInitialized) + return; + + if(Math.Abs(this.debounceTimer.Interval - this.DebounceTime.TotalMilliseconds) > 1) + this.debounceTimer.Interval = this.DebounceTime.TotalMilliseconds; + + // Only sync when the parent's parameter actually changed since the last change: + if (this.Text != this.lastParameterText) + { + this.text = this.Text ?? 
string.Empty; + this.lastParameterText = this.text; + } + + this.debounceTimer.Stop(); + this.debounceTimer.Start(); + + await base.OnParametersSetAsync(); + } +} diff --git a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor index 85e6e6eff..421dae839 100644 --- a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor +++ b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor @@ -1,5 +1,6 @@ @using AIStudio.Provider @using AIStudio.Provider.SelfHosted +@using AIStudio.Tools.Rust @inherits MSGComponentBase @@ -7,7 +8,7 @@ @* ReSharper disable once CSharpWarnings::CS8974 *@ - + @foreach (LLMProviders provider in Enum.GetValues(typeof(LLMProviders))) { if (provider.ProvideEmbeddingAPI() || provider is LLMProviders.NONE) @@ -22,7 +23,7 @@ @T("Create account") - + @if (this.DataLLMProvider.IsAPIKeyNeeded(this.DataHost)) { @@ -71,15 +72,14 @@ AdornmentColor="Color.Info" Validation="@this.ValidateManuallyModel" UserAttributes="@SPELLCHECK_ATTRIBUTES" - HelperText="@T("Currently, we cannot query the embedding models for the selected provider and/or host. Therefore, please enter the model name manually.")" - /> + HelperText="@T("Currently, we cannot query the embedding models for the selected provider and/or host. 
Therefore, please enter the model name manually.")"/> } else { @T("Load") - @if(this.availableModels.Count is 0) + @if (this.availableModels.Count is 0) { @T("No models loaded or available.") @@ -122,9 +122,13 @@ AdornmentIcon="@Icons.Material.Filled.Lightbulb" AdornmentColor="Color.Info" Validation="@this.providerValidation.ValidatingInstanceName" - UserAttributes="@SPELLCHECK_ATTRIBUTES" - /> - + UserAttributes="@SPELLCHECK_ATTRIBUTES"/> + + @T("For better embeddings and less storage usage, it's recommended to use a custom tokenizer to enable a more accurate token count.") + + @if (this.DataModel != default){ + + } @@ -133,7 +137,7 @@ @T("Cancel") - @if(this.IsEditing) + @if (this.IsEditing) { @T("Update") } @@ -143,4 +147,4 @@ } - \ No newline at end of file + diff --git a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs index 6520b7ee7..b45d687bc 100644 --- a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs +++ b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs @@ -1,3 +1,4 @@ +using AIStudio.Chat; using AIStudio.Components; using AIStudio.Provider; using AIStudio.Settings; @@ -89,6 +90,7 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId private string dataAPIKeyStorageIssue = string.Empty; private string dataEditingPreviousInstanceName = string.Empty; private string dataLoadingModelsIssue = string.Empty; + private string dataFilePath = string.Empty; // We get the form reference from Blazor code to validate it manually: private MudForm form = null!; @@ -96,7 +98,7 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId private readonly List availableModels = new(); private readonly Encryption encryption = Program.ENCRYPTION; private readonly ProviderValidation providerValidation; - + public EmbeddingProviderDialog() { this.providerValidation = new() @@ -264,6 +266,13 @@ private async Task 
OnAPIKeyChanged(string apiKey) await this.form.Validate(); } } + + private async Task OnDataFilePathChanged(string filePath) + { + await this.RustService.ValidateAndStoreTokenizer(this.DataModel.DisplayName, filePath); + } + + private void OnHostChanged(Host selectedHost) { @@ -307,4 +316,4 @@ private async Task ReloadModels() }; private bool IsNoneProvider => this.DataLLMProvider is LLMProviders.NONE; -} \ No newline at end of file +} diff --git a/app/MindWork AI Studio/Pages/Information.razor b/app/MindWork AI Studio/Pages/Information.razor index b7b9aea41..665afad69 100644 --- a/app/MindWork AI Studio/Pages/Information.razor +++ b/app/MindWork AI Studio/Pages/Information.razor @@ -290,6 +290,8 @@ + + diff --git a/app/MindWork AI Studio/Plugins/languages/de-de-43065dbc-78d0-45b7-92be-f14c2926e2dc/plugin.lua b/app/MindWork AI Studio/Plugins/languages/de-de-43065dbc-78d0-45b7-92be-f14c2926e2dc/plugin.lua index e9571a6c8..70c61528b 100644 --- a/app/MindWork AI Studio/Plugins/languages/de-de-43065dbc-78d0-45b7-92be-f14c2926e2dc/plugin.lua +++ b/app/MindWork AI Studio/Plugins/languages/de-de-43065dbc-78d0-45b7-92be-f14c2926e2dc/plugin.lua @@ -1821,6 +1821,9 @@ UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3403290862"] = "Der ausge -- Select a provider first UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3654197869"] = "Wähle zuerst einen Anbieter aus" +-- Estimated amount of tokens: +UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T377990776"] = "Geschätzte Anzahl an Tokens:" + -- Start new chat in workspace "{0}" UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3928697643"] = "Neuen Chat im Arbeitsbereich \"{0}\" starten" @@ -5400,6 +5403,9 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1019424746"] = "Startprotokollda -- Browse AI Studio's source code on GitHub — we welcome your contributions. 
UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1107156991"] = "Sehen Sie sich den Quellcode von AI Studio auf GitHub an – wir freuen uns über ihre Beiträge." +-- The Tokenizer library serves as the base framework for integrating the DeepSeek tokenizer. +UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1132433749"] = "Die Tokenizer‑Bibliothek dient als Basis‑Framework für die Integration des DeepSeek‑Tokenizers." + -- ID mismatch: the plugin ID differs from the enterprise configuration ID. UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1137744461"] = "ID-Konflikt: Die Plugin-ID stimmt nicht mit der ID der Unternehmenskonfiguration überein." @@ -5640,6 +5646,9 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T566998575"] = "Dies ist eine Bib -- Used .NET SDK UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T585329785"] = "Verwendetes .NET SDK" +-- We use the DeepSeek Tokenizer to estimate the number of tokens an input will generate. +UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T591393704"] = "Wir verwenden den DeepSeek‑Tokenizer, um die Token‑Anzahl einer Eingabe zu schätzen." + -- This library is used to manage sidecar processes and to ensure that stale or zombie sidecars are detected and terminated. UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T633932150"] = "Diese Bibliothek wird verwendet, um Sidecar-Prozesse zu verwalten und sicherzustellen, dass veraltete oder Zombie-Sidecars erkannt und beendet werden." 
@@ -6666,29 +6675,47 @@ UI_TEXT_CONTENT["AISTUDIO::TOOLS::RAG::RAGPROCESSES::AISRCSELWITHRETCTXVAL::T304 -- AI-based data source selection with AI retrieval context validation UI_TEXT_CONTENT["AISTUDIO::TOOLS::RAG::RAGPROCESSES::AISRCSELWITHRETCTXVAL::T3775725978"] = "KI-basierte Datenquellen-Auswahl mit Validierung des Abrufkontexts" --- Executable Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T2217313358"] = "Ausführbare Dateien" +-- Text +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1041509726"] = "Text" + +-- Office Files +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1063218378"] = "Office-Dateien" + +-- Executable +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1364437037"] = "Ausführbare Dateien" + +-- Mail +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1399880782"] = "E-Mail" + +-- Source like +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1487238587"] = "Source Code ähnlich" + +-- Image +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1494001562"] = "Bild" + +-- Video +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1533528076"] = "Video" --- All Source Code Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T2460199369"] = "Alle Quellcodedateien" +-- Source Code +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1569048941"] = "Quellcode" --- All Audio Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T2575722901"] = "Alle Audiodateien" +-- Config +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1779622119"] = "Konfiguration" --- All Video Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T2850789856"] = "Alle Videodateien" +-- Audio +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T2291602489"] = "Audio" --- PDF Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T3108466742"] = "PDF-Dateien" +-- Custom +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T2502277006"] = "Benutzerdefiniert" --- All Image Files 
-UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T4086723714"] = "Alle Bilddateien" +-- Media +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T3507473059"] = "Medien" --- Text Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T639143005"] = "Textdateien" +-- Source like prefix +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T378481461"] = "Source Code ähnlicher Prefix" --- All Office Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T709668067"] = "Alle Office-Dateien" +-- Document +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T4165204724"] = "Dokument" -- Pandoc Installation UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::PANDOCAVAILABILITYSERVICE::T185447014"] = "Pandoc-Installation" diff --git a/app/MindWork AI Studio/Plugins/languages/en-us-97dfb1ba-50c4-4440-8dfa-6575daf543c8/plugin.lua b/app/MindWork AI Studio/Plugins/languages/en-us-97dfb1ba-50c4-4440-8dfa-6575daf543c8/plugin.lua index 71f6c65ad..d792e90d7 100644 --- a/app/MindWork AI Studio/Plugins/languages/en-us-97dfb1ba-50c4-4440-8dfa-6575daf543c8/plugin.lua +++ b/app/MindWork AI Studio/Plugins/languages/en-us-97dfb1ba-50c4-4440-8dfa-6575daf543c8/plugin.lua @@ -1821,6 +1821,9 @@ UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3403290862"] = "The selec -- Select a provider first UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3654197869"] = "Select a provider first" +-- Estimated amount of tokens: +UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T377990776"] = "Estimated amount of tokens:" + -- Start new chat in workspace "{0}" UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3928697643"] = "Start new chat in workspace \"{0}\"" @@ -5400,6 +5403,9 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1019424746"] = "Startup log file -- Browse AI Studio's source code on GitHub — we welcome your contributions. 
UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1107156991"] = "Browse AI Studio's source code on GitHub — we welcome your contributions." +-- The Tokenizer library serves as the base framework for integrating the DeepSeek tokenizer. +UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1132433749"] = "The Tokenizer library serves as the base framework for integrating the DeepSeek tokenizer." + -- ID mismatch: the plugin ID differs from the enterprise configuration ID. UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1137744461"] = "ID mismatch: the plugin ID differs from the enterprise configuration ID." @@ -5640,6 +5646,9 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T566998575"] = "This is a library -- Used .NET SDK UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T585329785"] = "Used .NET SDK" +-- We use the DeepSeek Tokenizer to estimate the number of tokens an input will generate. +UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T591393704"] = "We use the DeepSeek Tokenizer to estimate the number of tokens an input will generate." + -- This library is used to manage sidecar processes and to ensure that stale or zombie sidecars are detected and terminated. UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T633932150"] = "This library is used to manage sidecar processes and to ensure that stale or zombie sidecars are detected and terminated." 
@@ -6666,29 +6675,47 @@ UI_TEXT_CONTENT["AISTUDIO::TOOLS::RAG::RAGPROCESSES::AISRCSELWITHRETCTXVAL::T304 -- AI-based data source selection with AI retrieval context validation UI_TEXT_CONTENT["AISTUDIO::TOOLS::RAG::RAGPROCESSES::AISRCSELWITHRETCTXVAL::T3775725978"] = "AI-based data source selection with AI retrieval context validation" --- Executable Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T2217313358"] = "Executable Files" +-- Text +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1041509726"] = "Text" + +-- Office Files +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1063218378"] = "Office Files" + +-- Executable +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1364437037"] = "Executable" + +-- Mail +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1399880782"] = "Mail" + +-- Source like +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1487238587"] = "Source like" + +-- Image +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1494001562"] = "Image" + +-- Video +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1533528076"] = "Video" --- All Source Code Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T2460199369"] = "All Source Code Files" +-- Source Code +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1569048941"] = "Source Code" --- All Audio Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T2575722901"] = "All Audio Files" +-- Config +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T1779622119"] = "Config" --- All Video Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T2850789856"] = "All Video Files" +-- Audio +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T2291602489"] = "Audio" --- PDF Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T3108466742"] = "PDF Files" +-- Custom +UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T2502277006"] = "Custom" --- All Image Files -UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T4086723714"] = "All 
Image Files"
+-- Media
+UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T3507473059"] = "Media"
 
--- Text Files
-UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T639143005"] = "Text Files"
+-- Source like prefix
+UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T378481461"] = "Source like prefix"
 
--- All Office Files
-UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T709668067"] = "All Office Files"
+-- Document
+UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T4165204724"] = "Document"
 
 -- Pandoc Installation
 UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::PANDOCAVAILABILITYSERVICE::T185447014"] = "Pandoc Installation"
diff --git a/app/MindWork AI Studio/Tools/PandocExport.cs b/app/MindWork AI Studio/Tools/PandocExport.cs
index 27e5244e5..e57afdd80 100644
--- a/app/MindWork AI Studio/Tools/PandocExport.cs
+++ b/app/MindWork AI Studio/Tools/PandocExport.cs
@@ -2,6 +2,7 @@ using AIStudio.Chat;
 using AIStudio.Dialogs;
 using AIStudio.Tools.PluginSystem;
+using AIStudio.Tools.Rust;
 using AIStudio.Tools.Services;
 
 using DialogOptions = AIStudio.Dialogs.DialogOptions;
 
@@ -16,7 +17,7 @@ public static class PandocExport
 
     public static async Task ToMicrosoftWord(RustService rustService, IDialogService dialogService, string dialogTitle, IContent markdownContent)
     {
-        var response = await rustService.SaveFile(dialogTitle, new("Microsoft Word", ["docx"]));
+        var response = await rustService.SaveFile(dialogTitle, [FileTypes.MS_WORD]);
         if (response.UserCancelled)
         {
             LOGGER.LogInformation("User cancelled the save dialog.");
diff --git a/app/MindWork AI Studio/Tools/Rust/FileType.cs b/app/MindWork AI Studio/Tools/Rust/FileType.cs
new file mode 100644
index 000000000..c333a6913
--- /dev/null
+++ b/app/MindWork AI Studio/Tools/Rust/FileType.cs
@@ -0,0 +1,41 @@
+namespace AIStudio.Tools.Rust;
+
+/// <summary>
+/// Represents a file type that can optionally contain child file types.
+/// Use the static helpers <see cref="Leaf"/>, <see cref="Parent"/> and <see cref="Composite"/> to build readable trees.
+/// </summary>
+/// <param name="FilterName">Display name of the type (e.g., "Document").</param>
+/// <param name="FilterExtensions">File extensions belonging to this type (without dot).</param>
+/// <param name="Children">Nested file types that are included when this type is selected.</param>
+public sealed record FileType(string FilterName, string[] FilterExtensions, IReadOnlyList<FileType> Children)
+{
+    /// <summary>
+    /// Factory for a leaf node.
+    /// Example: FileType.Leaf(".NET", "cs", "razor")
+    /// </summary>
+    public static FileType Leaf(string name, params string[] extensions) =>
+        new(name, extensions, []);
+
+    /// <summary>
+    /// Factory for a parent node that only has children.
+    /// Example: FileType.Parent("Source Code", dotnet, java)
+    /// </summary>
+    public static FileType Parent(string name, params FileType[]? children) =>
+        new(name, [], children ?? []);
+
+    /// <summary>
+    /// Factory for a composite node that has its own extensions in addition to children.
+    /// </summary>
+    public static FileType Composite(string name, string[] extensions, params FileType[] children) =>
+        new(name, extensions, children);
+
+    /// <summary>
+    /// Collects all extensions for this type, including children, without case-insensitive duplicates.
+    /// </summary>
+    public IEnumerable<string> FlattenExtensions()
+    {
+        return this.FilterExtensions
+            .Concat(this.Children.SelectMany(child => child.FlattenExtensions()))
+            .Distinct(StringComparer.OrdinalIgnoreCase);
+    }
+}
\ No newline at end of file
diff --git a/app/MindWork AI Studio/Tools/Rust/FileTypeFilter.cs b/app/MindWork AI Studio/Tools/Rust/FileTypeFilter.cs
index d93f44e01..f4cd1c7e5 100644
--- a/app/MindWork AI Studio/Tools/Rust/FileTypeFilter.cs
+++ b/app/MindWork AI Studio/Tools/Rust/FileTypeFilter.cs
@@ -1,125 +1,54 @@
-// ReSharper disable NotAccessedPositionalProperty.Global
-
-using AIStudio.Tools.PluginSystem;
-
 namespace AIStudio.Tools.Rust;
 
 /// <summary>
-/// Represents a file type filter for file selection dialogs.
+/// Represents a file type that can optionally contain child file types.
+/// Use the static helpers <see cref="Leaf"/>, <see cref="Parent"/> and <see cref="Composite"/> to build readable trees.
 /// </summary>
-/// <param name="FilterName">The name of the filter.</param>
-/// <param name="FilterExtensions">The file extensions associated with the filter.</param>
-public readonly record struct FileTypeFilter(string FilterName, string[] FilterExtensions)
+/// <param name="FilterName">Display name of the type (e.g., "Document").</param>
+/// <param name="FilterExtensions">File extensions belonging to this type (without dot).</param>
+/// <param name="Children">Nested file types that are included when this type is selected.</param>
+public sealed record FileTypeFilter(string FilterName, string[] FilterExtensions, IReadOnlyList<FileTypeFilter> Children)
 {
-    private static string TB(string fallbackEN) => I18N.I.T(fallbackEN, typeof(FileTypeFilter).Namespace, nameof(FileTypeFilter));
-
-    private static string[] AllowedSourceLikeFileNames =>
-    [
-        "Dockerfile",
-        "Containerfile",
-        "Jenkinsfile",
-        "Makefile",
-        "GNUmakefile",
-        "Procfile",
-        "Vagrantfile",
-        "Tiltfile",
-        "Justfile",
-        "Brewfile",
-        "Caddyfile",
-        "Gemfile",
-        "Podfile",
-        "Fastfile",
-        "Appfile",
-        "Rakefile",
-        "Dangerfile",
-        "BUILD",
-        "WORKSPACE",
-        "BUCK",
-    ];
-
-    private static string[] AllowedSourceLikeFileNamePrefixes =>
-    [
-        "Dockerfile",
-        "Containerfile",
-        "Jenkinsfile",
-        "Procfile",
-        "Caddyfile",
-    ];
-
-    public static bool IsAllowedSourceLikeFileName(string filePath)
+    /// <summary>
+    /// Factory for a leaf node.
+    /// Example: FileTypeFilter.Leaf(".NET", "cs", "razor")
+    /// </summary>
+    public static FileTypeFilter Leaf(string name, params string[] extensions) =>
+        new(name, extensions, []);
+
+    /// <summary>
+    /// Factory for a parent node that only has children.
+    /// Example: FileTypeFilter.Parent("Source Code", dotnet, java)
+    /// </summary>
+    public static FileTypeFilter Parent(string name, params FileTypeFilter[]? children) =>
+        new(name, [], children ?? []);
+
+    /// <summary>
+    /// Factory for a composite node that has its own extensions in addition to children.
+    /// </summary>
+    public static FileTypeFilter Composite(string name, string[] extensions, params FileTypeFilter[] children) =>
+        new(name, extensions, children);
+
+    /// <summary>
+    /// Collects all extensions for this type, including children, without case-insensitive duplicates.
+    /// </summary>
+    public IEnumerable<string> FlattenExtensions()
     {
-        var fileName = Path.GetFileName(filePath);
-        if (string.IsNullOrWhiteSpace(fileName))
-            return false;
-
-        if (AllowedSourceLikeFileNames.Any(name => string.Equals(name, fileName, StringComparison.OrdinalIgnoreCase)))
+        return this.FilterExtensions
+            .Concat(this.Children.SelectMany(child => child.FlattenExtensions()))
+            .Distinct(StringComparer.OrdinalIgnoreCase);
+    }
+
+    /// <summary>
+    /// Checks whether the given type is this node or one of its descendants.
+    /// </summary>
+    /// <param name="target">The type to look for.</param>
+    /// <returns>True when this node or a nested child equals the target.</returns>
+    public bool ContainsType(FileTypeFilter target)
+    {
+        if (this == target)
             return true;
 
-        return AllowedSourceLikeFileNamePrefixes.Any(prefix => fileName.StartsWith(prefix, StringComparison.OrdinalIgnoreCase));
+        return this.Children.Any(child => child.ContainsType(target));
     }
-
-    public static FileTypeFilter PDF => new(TB("PDF Files"), ["pdf"]);
-
-    public static FileTypeFilter Text => new(TB("Text Files"), ["txt", "md"]);
-
-    public static FileTypeFilter AllOffice => new(TB("All Office Files"), ["docx", "xlsx", "pptx", "doc", "xls", "ppt", "pdf"]);
-
-    public static FileTypeFilter AllImages => new(TB("All Image Files"), ["jpg", "jpeg", "png", "gif", "bmp", "tiff", "svg", "webp", "heic"]);
-
-    public static FileTypeFilter AllVideos => new(TB("All Video Files"), ["mp4", "m4v", "avi", "mkv", "mov", "wmv", "flv", "webm"]);
-
-    public static FileTypeFilter AllAudio => new(TB("All Audio Files"), ["mp3", "wav", "wave", "aac", "flac", "ogg", "m4a", "wma", "alac", "aiff", "m4b"]);
-
-    public static FileTypeFilter AllSourceCode => new(TB("All Source Code Files"),
-    [
-        // .NET
-        "cs", "vb", "fs", "razor", "aspx", "cshtml", "csproj",
-
-        // Java:
-        "java",
-
-        // Python:
-        "py",
-
-        // JavaScript/TypeScript:
-        "js", "ts",
-
-        // C/C++:
-        "c", "cpp", "h", "hpp",
-
-        // Ruby:
-        "rb",
-
-        // Go:
-        "go",
-
-        // Rust:
-        "rs",
-
-        // Lua:
-        "lua",
-
-        // PHP:
-        "php",
-
-        // HTML/CSS:
-        "html", "css",
-
-        // Swift/Kotlin:
-        "swift", "kt",
-
-        // Shell scripts:
-        "sh", "bash",
-
-        // Logging files:
-        "log",
-
-        // JSON/YAML/XML:
-        "json", "yaml", "yml", "xml",
-
-
// Config files:
-        "ini", "cfg", "toml", "plist",
-    ]);
-
-    public static FileTypeFilter Executables => new(TB("Executable Files"), ["exe", "app", "bin", "appimage"]);
 }
\ No newline at end of file
diff --git a/app/MindWork AI Studio/Tools/Rust/FileTypes.cs b/app/MindWork AI Studio/Tools/Rust/FileTypes.cs
new file mode 100644
index 000000000..789eb7d6b
--- /dev/null
+++ b/app/MindWork AI Studio/Tools/Rust/FileTypes.cs
@@ -0,0 +1,154 @@
+using AIStudio.Tools.PluginSystem;
+
+namespace AIStudio.Tools.Rust;
+
+/// <summary>
+/// Central definition of supported file types with parent/child relationships and helpers
+/// to build extension whitelists (e.g., for file pickers or validation).
+/// </summary>
+public static class FileTypes
+{
+    // The language files key these texts under ...::RUST::FILETYPES, so the lookup must name this class.
+    private static string TB(string fallbackEn) => I18N.I.T(fallbackEn, typeof(FileTypes).Namespace, nameof(FileTypes));
+
+    // Keep SOURCE_LIKE in the same leaf style as the other file types.
+    // These values are not sufficient for Dockerfile-style files without extensions,
+    // therefore IsAllowedPath matches them by file name instead.
+    public static readonly FileTypeFilter SOURCE_LIKE_FILE_NAMES = FileTypeFilter.Leaf(TB("Source like"),
+        "Dockerfile", "Containerfile", "Jenkinsfile", "Makefile", "GNUmakefile", "Procfile", "Vagrantfile",
+        "Tiltfile", "Justfile", "Brewfile", "Caddyfile", "Gemfile", "Podfile", "Fastfile", "Appfile", "Rakefile", "Dangerfile",
+        "BUILD", "WORKSPACE", "BUCK");
+
+    public static readonly FileTypeFilter SOURCE_LIKE_FILE_NAME_PREFIXES = FileTypeFilter.Leaf(TB("Source like prefix"),
+        "Dockerfile", "Containerfile", "Jenkinsfile", "Procfile", "Caddyfile");
+
+    // Source code hierarchy: SourceCode -> (.NET, Java, Python, Web, C/C++, Config, ...)
+    public static readonly FileTypeFilter DOTNET = FileTypeFilter.Leaf(".NET", "cs", "razor", "vb", "fs", "aspx", "cshtml", "csproj");
+    public static readonly FileTypeFilter JAVA = FileTypeFilter.Leaf("Java", "java");
+    public static readonly FileTypeFilter PYTHON = FileTypeFilter.Leaf("Python", "py");
+    public static readonly FileTypeFilter JAVASCRIPT = FileTypeFilter.Leaf("JavaScript/TypeScript", "js", "ts");
+    public static readonly FileTypeFilter CFAMILY = FileTypeFilter.Leaf("C/C++", "c", "cpp", "h", "hpp");
+    public static readonly FileTypeFilter RUBY = FileTypeFilter.Leaf("Ruby", "rb");
+    public static readonly FileTypeFilter GO = FileTypeFilter.Leaf("Go", "go");
+    public static readonly FileTypeFilter RUST = FileTypeFilter.Leaf("Rust", "rs");
+    public static readonly FileTypeFilter LUA = FileTypeFilter.Leaf("Lua", "lua");
+    public static readonly FileTypeFilter PHP = FileTypeFilter.Leaf("PHP", "php");
+    public static readonly FileTypeFilter WEB = FileTypeFilter.Leaf("HTML/CSS", "html", "css");
+    public static readonly FileTypeFilter APP = FileTypeFilter.Leaf("Swift/Kotlin", "swift", "kt");
+    public static readonly FileTypeFilter SHELL = FileTypeFilter.Leaf("Shell", "sh", "bash", "zsh");
+    public static readonly FileTypeFilter LOG = FileTypeFilter.Leaf("Log", "log");
+    public static readonly FileTypeFilter JSON = FileTypeFilter.Leaf("JSON", "json");
+    public static readonly FileTypeFilter XML = FileTypeFilter.Leaf("XML", "xml");
+    public static readonly FileTypeFilter YAML = FileTypeFilter.Leaf("YAML", "yaml", "yml");
+    public static readonly FileTypeFilter CONFIG = FileTypeFilter.Leaf(TB("Config"), "ini", "cfg", "toml", "plist");
+
+    public static readonly FileTypeFilter SOURCE_CODE = FileTypeFilter.Parent(TB("Source Code"),
+        DOTNET, JAVA, PYTHON, JAVASCRIPT, CFAMILY, RUBY, GO, RUST, LUA, PHP, WEB, APP, SHELL, LOG, JSON, XML, YAML, CONFIG, SOURCE_LIKE_FILE_NAMES, SOURCE_LIKE_FILE_NAME_PREFIXES);
+
+    // Document hierarchy
+    public static readonly FileTypeFilter PDF = FileTypeFilter.Leaf("PDF", "pdf");
+    public static readonly FileTypeFilter TEXT = FileTypeFilter.Leaf(TB("Text"), "txt", "md", "rtf");
+    public static readonly FileTypeFilter MS_WORD = FileTypeFilter.Leaf("Microsoft Word", "docx", "doc");
+    public static readonly FileTypeFilter WORD = FileTypeFilter.Composite("Word", ["odt"], MS_WORD);
+    public static readonly FileTypeFilter EXCEL = FileTypeFilter.Leaf("Excel", "xls", "xlsx");
+    public static readonly FileTypeFilter POWER_POINT = FileTypeFilter.Leaf("PowerPoint", "ppt", "pptx");
+    public static readonly FileTypeFilter MAIL = FileTypeFilter.Leaf(TB("Mail"), "eml", "msg", "mbox");
+
+    public static readonly FileTypeFilter OFFICE_FILES = FileTypeFilter.Parent(TB("Office Files"),
+        WORD, EXCEL, POWER_POINT, PDF);
+    public static readonly FileTypeFilter DOCUMENT = FileTypeFilter.Parent(TB("Document"),
+        TEXT, OFFICE_FILES, SOURCE_CODE, MAIL);
+
+    // Media hierarchy
+    public static readonly FileTypeFilter IMAGE = FileTypeFilter.Leaf(TB("Image"),
+        "jpg", "jpeg", "png", "gif", "bmp", "tiff", "svg", "webp", "heic");
+    public static readonly FileTypeFilter AUDIO = FileTypeFilter.Leaf(TB("Audio"),
+        "mp3", "wav", "wave", "aac", "flac", "ogg", "m4a", "wma", "alac", "aiff", "m4b");
+    public static readonly FileTypeFilter VIDEO = FileTypeFilter.Leaf(TB("Video"),
+        "mp4", "m4v", "avi", "mkv", "mov", "wmv", "flv", "webm");
+
+    public static readonly FileTypeFilter MEDIA = FileTypeFilter.Parent(TB("Media"), IMAGE, AUDIO, VIDEO);
+
+    // Other standalone types
+    public static readonly FileTypeFilter EXECUTABLES = FileTypeFilter.Leaf(TB("Executable"), "exe", "app", "bin", "appimage");
+
+    /// <summary>
+    /// Collapses the given types into the single filter that a file dialog request carries.
+    /// </summary>
+    /// <param name="types">The file types to merge; may be null or empty.</param>
+    /// <returns>
+    /// Null when no type is given, the type itself when it is a single leaf, and otherwise a flat
+    /// filter that lists every collected extension in its own FilterExtensions.
+    /// </returns>
+    public static FileTypeFilter? AsOneFileType(params FileTypeFilter[]? types)
+    {
+        if (types == null || types.Length == 0)
+            return null;
+
+        // A lone parent node has no extensions of its own. It is flattened too, so that a
+        // consumer reading only FilterExtensions does not receive an empty list.
+        if (types.Length == 1)
+            return types[0].Children.Count == 0 ? types[0] : FileTypeFilter.Leaf(types[0].FilterName, OnlyAllowTypes(types));
+
+        return FileTypeFilter.Composite(TB("Custom"), OnlyAllowTypes(types));
+    }
+
+    /// <summary>
+    /// Builds the lower-cased, duplicate-free extension whitelist for the given types.
+    /// The source-like file name lists are skipped at every nesting level, because their
+    /// entries are whole file names rather than extensions.
+    /// </summary>
+    /// <param name="types">The file types whose extensions are collected.</param>
+    /// <returns>The collected extensions, without a leading dot.</returns>
+    public static string[] OnlyAllowTypes(params FileTypeFilter[] types)
+    {
+        if (types.Length == 0)
+            return [];
+
+        return types
+            .SelectMany(CollectExtensions)
+            .Select(ext => ext.ToLowerInvariant())
+            .Distinct(StringComparer.OrdinalIgnoreCase)
+            .ToArray();
+    }
+
+    /// <summary>
+    /// Yields the extensions of a type tree, leaving out the source-like file name nodes.
+    /// </summary>
+    private static IEnumerable<string> CollectExtensions(FileTypeFilter type)
+    {
+        if (type == SOURCE_LIKE_FILE_NAMES || type == SOURCE_LIKE_FILE_NAME_PREFIXES)
+            return [];
+
+        return type.FilterExtensions.Concat(type.Children.SelectMany(CollectExtensions));
+    }
+
+    /// <summary>
+    /// Validates a file path against the provided filters.
+    /// Supports extension-based matching and source-like file names (e.g. Dockerfile).
+    /// </summary>
+    /// <param name="filePath">The path or file name to check.</param>
+    /// <param name="types">The accepted file types.</param>
+    /// <returns>True when the path matches one of the types; false otherwise, also for missing input.</returns>
+    public static bool IsAllowedPath(string filePath, params FileTypeFilter[]? types)
+    {
+        if (types == null || types.Length == 0 || string.IsNullOrWhiteSpace(filePath))
+            return false;
+
+        var extension = Path.GetExtension(filePath).TrimStart('.');
+        if (!string.IsNullOrWhiteSpace(extension) && OnlyAllowTypes(types).Contains(extension, StringComparer.OrdinalIgnoreCase))
+            return true;
+
+        var fileName = Path.GetFileName(filePath);
+        if (string.IsNullOrWhiteSpace(fileName))
+            return false;
+
+        // Whole names such as Makefile are compared without regard to case, like the prefixes below.
+        if (types.Any(t => t.ContainsType(SOURCE_LIKE_FILE_NAMES))
+            && SOURCE_LIKE_FILE_NAMES.FilterExtensions.Contains(fileName, StringComparer.OrdinalIgnoreCase))
+            return true;
+
+        return types.Any(t => t.ContainsType(SOURCE_LIKE_FILE_NAME_PREFIXES))
+            && SOURCE_LIKE_FILE_NAME_PREFIXES.FilterExtensions.Any(prefix => fileName.StartsWith(prefix, StringComparison.OrdinalIgnoreCase));
+    }
+}
diff --git a/app/MindWork AI Studio/Tools/Rust/SaveFileOptions.cs b/app/MindWork AI Studio/Tools/Rust/SaveFileOptions.cs
index 107e581a7..f1300ac17 100644
--- a/app/MindWork AI Studio/Tools/Rust/SaveFileOptions.cs
+++ b/app/MindWork AI Studio/Tools/Rust/SaveFileOptions.cs
@@ -6,5 +6,5 @@ public class SaveFileOptions
 
     public PreviousFile?
PreviousFile { get; init; } - public FileTypeFilter? Filter { get; init; } + public FileType? Filter { get; init; } } \ No newline at end of file diff --git a/app/MindWork AI Studio/Tools/Rust/SelectFileOptions.cs b/app/MindWork AI Studio/Tools/Rust/SelectFileOptions.cs index 28d16809a..fac7d5f4e 100644 --- a/app/MindWork AI Studio/Tools/Rust/SelectFileOptions.cs +++ b/app/MindWork AI Studio/Tools/Rust/SelectFileOptions.cs @@ -6,5 +6,5 @@ public sealed class SelectFileOptions public PreviousFile? PreviousFile { get; init; } - public FileTypeFilter? Filter { get; init; } + public FileType? Filter { get; init; } } \ No newline at end of file diff --git a/app/MindWork AI Studio/Tools/Rust/TokenCountInfo.cs b/app/MindWork AI Studio/Tools/Rust/TokenCountInfo.cs new file mode 100644 index 000000000..c0e491bf4 --- /dev/null +++ b/app/MindWork AI Studio/Tools/Rust/TokenCountInfo.cs @@ -0,0 +1,6 @@ +namespace AIStudio.Tools.Rust; + +public sealed class TokenCountInfo +{ + public int TokenCount { get; set; } +} \ No newline at end of file diff --git a/app/MindWork AI Studio/Tools/Rust/TokenizerUploadResponse.cs b/app/MindWork AI Studio/Tools/Rust/TokenizerUploadResponse.cs new file mode 100644 index 000000000..c141ec746 --- /dev/null +++ b/app/MindWork AI Studio/Tools/Rust/TokenizerUploadResponse.cs @@ -0,0 +1,3 @@ +namespace AIStudio.Tools.Rust; + +public readonly record struct TokenizerUploadResponse(int Success, string Response); \ No newline at end of file diff --git a/app/MindWork AI Studio/Tools/Services/RustService.FileSystem.cs b/app/MindWork AI Studio/Tools/Services/RustService.FileSystem.cs index 4a498b016..7e7c24286 100644 --- a/app/MindWork AI Studio/Tools/Services/RustService.FileSystem.cs +++ b/app/MindWork AI Studio/Tools/Services/RustService.FileSystem.cs @@ -17,13 +17,13 @@ public async Task SelectDirectory(string title, stri return await result.Content.ReadFromJsonAsync(this.jsonRustSerializerOptions); } - public async Task SelectFile(string title, 
FileTypeFilter? filter = null, string? initialFile = null) + public async Task SelectFile(string title, FileTypeFilter[]? filter = null, string? initialFile = null) { var payload = new SelectFileOptions { Title = title, PreviousFile = initialFile is null ? null : new (initialFile), - Filter = filter + Filter = FileTypes.AsOneFileType(filter) }; var result = await this.http.PostAsJsonAsync("/select/file", payload, this.jsonRustSerializerOptions); @@ -36,13 +36,13 @@ public async Task SelectFile(string title, FileTypeFilter return await result.Content.ReadFromJsonAsync(this.jsonRustSerializerOptions); } - public async Task SelectFiles(string title, FileTypeFilter? filter = null, string? initialFile = null) + public async Task SelectFiles(string title, FileTypeFilter[]? filter = null, string? initialFile = null) { var payload = new SelectFileOptions { Title = title, PreviousFile = initialFile is null ? null : new (initialFile), - Filter = filter + Filter = FileTypes.AsOneFileType(filter) }; var result = await this.http.PostAsJsonAsync("/select/files", payload, this.jsonRustSerializerOptions); @@ -63,13 +63,13 @@ public async Task SelectFiles(string title, FileTypeFilt /// An optional initial file path to pre-fill in the dialog. /// A object containing information about whether the user canceled the /// operation and whether the select operation was successful. - public async Task SaveFile(string title, FileTypeFilter? filter = null, string? initialFile = null) + public async Task SaveFile(string title, FileTypeFilter[]? filter = null, string? initialFile = null) { var payload = new SaveFileOptions { Title = title, PreviousFile = initialFile is null ? null : new (initialFile), - Filter = filter + Filter = FileTypes.AsOneFileType(filter) }; var result = await this.http.PostAsJsonAsync("/save/file", payload, this.jsonRustSerializerOptions); @@ -81,4 +81,21 @@ public async Task SaveFile(string title, FileTypeFilter? 
filte
 
         return await result.Content.ReadFromJsonAsync(this.jsonRustSerializerOptions);
     }
+
+    /// <summary>
+    /// Posts the model ID and the tokenizer file path to the runtime's /tokenizer/val-and-store endpoint.
+    /// </summary>
+    /// <returns>The runtime's answer, or a response with Success = -1 when the request failed.</returns>
+    public async Task<TokenizerUploadResponse> ValidateAndStoreTokenizer(string? modelId, string filePath)
+    {
+        var payload = new { model_id = modelId, file_path = filePath };
+        var result = await this.http.PostAsJsonAsync("/tokenizer/val-and-store", payload, this.jsonRustSerializerOptions);
+        if (!result.IsSuccessStatusCode)
+        {
+            this.logger?.LogError($"Failed to validate and store the tokenizer '{result.StatusCode}'");
+            return new TokenizerUploadResponse(-1, "An error occurred while validating and storing the tokenizer");
+        }
+
+        return await result.Content.ReadFromJsonAsync<TokenizerUploadResponse>(this.jsonRustSerializerOptions);
+    }
 }
\ No newline at end of file
diff --git a/app/MindWork AI Studio/Tools/Services/RustService.Tokenizer.cs b/app/MindWork AI Studio/Tools/Services/RustService.Tokenizer.cs
new file mode 100644
index 000000000..e01272dbe
--- /dev/null
+++ b/app/MindWork AI Studio/Tools/Services/RustService.Tokenizer.cs
@@ -0,0 +1,33 @@
+using AIStudio.Tools.Rust;
+
+namespace AIStudio.Tools.Services;
+
+public sealed partial class RustService
+{
+    /// <summary>
+    /// Requests the token count for the given text from the runtime's /system/tokenizer/count endpoint.
+    /// </summary>
+    /// <param name="text">The text whose tokens are counted.</param>
+    /// <returns>The token count, or null when the request failed or exceeded the five-second timeout.</returns>
+    public async Task<TokenCountInfo?> GetTokenCount(string text)
+    {
+        try
+        {
+            // Both the token source and the response are disposable; release them once the call is done.
+            using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(5));
+            var payload = new { text };
+            using var response = await this.http.PostAsJsonAsync("/system/tokenizer/count", payload, this.jsonRustSerializerOptions, cts.Token);
+            response.EnsureSuccessStatusCode();
+            return await response.Content.ReadFromJsonAsync<TokenCountInfo>(this.jsonRustSerializerOptions, cancellationToken: cts.Token);
+        }
+        catch (Exception e)
+        {
+            if(this.logger is not null)
+                this.logger.LogError(e, "Error while getting token count from Rust service.");
+            else
+                Console.WriteLine($"Error while getting token count from Rust service: '{e}'.");
+
+            return null;
+        }
+    }
+}
\ No newline at end of file
diff --git a/app/MindWork AI Studio/Tools/Validation/FileExtensionValidation.cs b/app/MindWork AI Studio/Tools/Validation/FileExtensionValidation.cs
index
02a978d1a..d38a8c086 100644
--- a/app/MindWork AI Studio/Tools/Validation/FileExtensionValidation.cs
+++ b/app/MindWork AI Studio/Tools/Validation/FileExtensionValidation.cs
@@ -43,8 +43,7 @@ public enum UseCase
     /// <returns>True if valid, false if invalid (error/warning already sent via MessageBus).</returns>
     public static async Task<bool> IsExtensionValidWithNotifyAsync(UseCase useCae, string filePath, bool validateMediaFileTypes = true, Settings.Provider? provider = null)
     {
-        var ext = Path.GetExtension(filePath).TrimStart('.').ToLowerInvariant();
-        if(FileTypeFilter.Executables.FilterExtensions.Contains(ext))
+        if (FileTypes.IsAllowedPath(filePath, FileTypes.EXECUTABLES))
         {
             await MessageBus.INSTANCE.SendError(new(
                 Icons.Material.Filled.AppBlocking,
@@ -53,7 +52,7 @@ await MessageBus.INSTANCE.SendError(new(
         }
 
         var capabilities = provider?.GetModelCapabilities() ?? new();
-        if (FileTypeFilter.AllImages.FilterExtensions.Contains(ext))
+        if (FileTypes.IsAllowedPath(filePath, FileTypes.IMAGE))
         {
             switch (useCae)
             {
@@ -88,7 +87,7 @@ await MessageBus.INSTANCE.SendWarning(new(
             }
         }
 
-        if(FileTypeFilter.AllVideos.FilterExtensions.Contains(ext))
+        if (FileTypes.IsAllowedPath(filePath, FileTypes.VIDEO))
         {
             await MessageBus.INSTANCE.SendWarning(new(
                 Icons.Material.Filled.FeaturedVideo,
@@ -96,7 +95,7 @@ await MessageBus.INSTANCE.SendWarning(new(
             return false;
         }
 
-        if(FileTypeFilter.AllAudio.FilterExtensions.Contains(ext))
+        if (FileTypes.IsAllowedPath(filePath, FileTypes.AUDIO))
         {
             await MessageBus.INSTANCE.SendWarning(new(
                 Icons.Material.Filled.AudioFile,
@@ -123,7 +122,8 @@ await MessageBus.INSTANCE.SendError(new(
                 return false;
             }
 
-            if (!Array.Exists(FileTypeFilter.AllImages.FilterExtensions, x => x.Equals(ext, StringComparison.OrdinalIgnoreCase)))
+            // Report the error only when the file is NOT an image, exactly as the replaced Array.Exists check did.
+            if (!FileTypes.IsAllowedPath(filePath, FileTypes.IMAGE))
             {
                 await MessageBus.INSTANCE.SendError(new(
                     Icons.Material.Filled.ImageNotSupported,
diff --git a/runtime/src/lib.rs b/runtime/src/lib.rs
index 1b13e0991..102efbe2e 100644
--- a/runtime/src/lib.rs
+++
b/runtime/src/lib.rs
@@ -17,4 +17,5 @@ pub mod qdrant;
 pub mod certificate_factory;
 pub mod runtime_api_token;
 pub mod stale_process_cleanup;
-mod sidecar_types;
\ No newline at end of file
+mod sidecar_types;
+pub mod tokenizer;
\ No newline at end of file
diff --git a/runtime/src/main.rs b/runtime/src/main.rs
index 00a7ba905..a210de540 100644
--- a/runtime/src/main.rs
+++ b/runtime/src/main.rs
@@ -11,7 +11,7 @@ use mindwork_ai_studio::environment::is_dev;
 use mindwork_ai_studio::log::init_logging;
 use mindwork_ai_studio::metadata::MetaData;
 use mindwork_ai_studio::runtime_api::start_runtime_api;
-
+use mindwork_ai_studio::tokenizer::{init_tokenizer};
 
 #[tokio::main]
 async fn main() {
@@ -43,8 +43,12 @@ async fn main() {
         info!("Running in production mode.");
     }
 
+    if let Err(e) = init_tokenizer() {
+        warn!(Source = "Tokenizer"; "Error during the initialisation of the tokenizer: {}", e);
+    }
+
     generate_runtime_certificate();
     start_runtime_api();
 
     start_tauri();
-}
\ No newline at end of file
+}
diff --git a/runtime/src/runtime_api.rs b/runtime/src/runtime_api.rs
index 64bc8174a..6ceeb1e2b 100644
--- a/runtime/src/runtime_api.rs
+++ b/runtime/src/runtime_api.rs
@@ -89,6 +89,7 @@ pub fn start_runtime_api() {
                 crate::file_data::extract_data,
                 crate::log::get_log_paths,
                 crate::log::log_event,
+                crate::tokenizer::tokenizer_count,
                 crate::app_window::register_shortcut,
                 crate::app_window::validate_shortcut,
                 crate::app_window::suspend_shortcuts,
diff --git a/runtime/src/tokenizer.rs b/runtime/src/tokenizer.rs
new file mode 100644
index 000000000..3614b3968
--- /dev/null
+++ b/runtime/src/tokenizer.rs
@@ -0,0 +1,69 @@
+//! Token counting backed by one process-wide tokenizer from the `tokenizers` crate.
+
+use std::fs;
+use std::path::PathBuf;
+use std::sync::OnceLock;
+use rocket::post;
+use rocket::serde::json::Json;
+use rocket::serde::Serialize;
+use serde::Deserialize;
+use tokenizers::Error;
+use tokenizers::tokenizer::Tokenizer;
+use crate::api_token::APIToken;
+
+/// The process-wide tokenizer. It stays empty until [`init_tokenizer`] succeeds.
+static TOKENIZER: OnceLock<Tokenizer> = OnceLock::new();
+
+/// Loads `target/tokenizers/tokenizer.json` and installs it as the process-wide tokenizer.
+///
+/// The path is relative to the current working directory; the directory is created when missing.
+///
+/// # Errors
+///
+/// Fails when the directory cannot be created, when the file cannot be loaded as a tokenizer,
+/// or when a tokenizer was already installed by an earlier call.
+pub fn init_tokenizer() -> Result<(), Error> {
+    let tokenizer_dir = PathBuf::from("target").join("tokenizers");
+    fs::create_dir_all(&tokenizer_dir)?;
+
+    let tokenizer = Tokenizer::from_file(tokenizer_dir.join("tokenizer.json"))?;
+
+    // A second call reports an error instead of panicking, because the caller only logs failures.
+    TOKENIZER
+        .set(tokenizer)
+        .map_err(|_| "The tokenizer was already initialised.".into())
+}
+
+/// Counts the tokens of `text`, with the tokenizer's special tokens added.
+///
+/// Returns `0` when no tokenizer is installed or when the text cannot be encoded, so a failed
+/// [`init_tokenizer`] does not make later requests panic.
+pub fn get_token_count(text: &str) -> usize {
+    let Some(tokenizer) = TOKENIZER.get() else {
+        return 0;
+    };
+
+    tokenizer
+        .encode(text, true)
+        .map_or(0, |encoding| encoding.len())
+}
+
+/// Request body of the token count endpoint.
+#[derive(Deserialize)]
+pub struct SetTokenText {
+    pub text: String,
+}
+
+/// Response body of the token count endpoint.
+#[derive(Serialize)]
+pub struct GetTokenCount {
+    token_count: usize,
+}
+
+/// Returns the number of tokens in the posted text.
+#[post("/system/tokenizer/count", data = "<req>")]
+pub fn tokenizer_count(_token: APIToken, req: Json<SetTokenText>) -> Json<GetTokenCount> {
+    Json(GetTokenCount {
+        token_count: get_token_count(&req.text),
+    })
+}
\ No newline at end of file