diff --git a/internal/message/handler.go b/internal/message/handler.go
index f693f43..7c973ef 100644
--- a/internal/message/handler.go
+++ b/internal/message/handler.go
@@ -694,7 +694,8 @@ func (h *Handler) decorateEvent(m Message) error {
 	e.Event.CostInUsd = cost + completionCost
 
 	if e.CostMap != nil {
-		newCost, err := provider.EstimateTotalCostWithCostMaps(e.Event.Model, tks, completiontks, 1000, e.CostMap.PromptCostPerModel, e.CostMap.CompletionCostPerModel)
+		model := openai.ModelWithContextLength(e.Event.Model, int64(tks+completiontks))
+		newCost, err := provider.EstimateTotalCostWithCostMaps(model, tks, completiontks, 1000, e.CostMap.PromptCostPerModel, e.CostMap.CompletionCostPerModel)
 		if err != nil {
 			h.log.Debug("error when estimating total cost with cost maps", zap.Error(err))
 			telemetry.Incr("bricksllm.proxy.decorate_event.estimate_total_cost_with_cost_maps_error", nil, 1)
diff --git a/internal/provider/openai/cost.go b/internal/provider/openai/cost.go
index ba63fe3..3ac1d99 100644
--- a/internal/provider/openai/cost.go
+++ b/internal/provider/openai/cost.go
@@ -41,10 +41,14 @@ var OpenAiPerThousandTokenCost = map[string]map[string]float64{
 		"chatgpt-image-latest": 0.005,
 		"gpt-image-1-mini":     0.002,
 
-		"gpt-5.4":      0.005,
-		"gpt-5.4-mini": 0.00075,
-		"gpt-5.4-nano": 0.0002,
-		"gpt-5.4-pro":  0.06,
+		"gpt-5.4":           0.005,
+		"gpt-5.4~long":      0.005,
+		"gpt-5.4~short":     0.0025,
+		"gpt-5.4-mini":      0.00075,
+		"gpt-5.4-nano":      0.0002,
+		"gpt-5.4-pro":       0.06,
+		"gpt-5.4-pro~long":  0.06,
+		"gpt-5.4-pro~short": 0.03,
 
 		"gpt-5.3-codex": 0.00175,
@@ -111,6 +115,12 @@ var OpenAiPerThousandTokenCost = map[string]map[string]float64{
 		"babbage-002": 0.000400,
 	},
 	"cached-prompt": {
+		"gpt-5.4":       0.0005,
+		"gpt-5.4~long":  0.0005,
+		"gpt-5.4~short": 0.00025,
+		"gpt-5.4-mini":  0.000075,
+		"gpt-5.4-nano":  0.00002,
+		"gpt-image-1.5": 0.00125,
 		"gpt-image-1":          0.00125,
 		"chatgpt-image-latest": 0.00125,
@@ -192,10 +202,14 @@ var OpenAiPerThousandTokenCost = map[string]map[string]float64{
 		"gpt-image-1.5":        0.010,
 		"chatgpt-image-latest": 0.010,
 
-		"gpt-5.4":      0.0225,
-		"gpt-5.4-mini": 0.0045,
-		"gpt-5.4-nano": 0.00125,
-		"gpt-5.4-pro":  0.27,
+		"gpt-5.4":           0.0225,
+		"gpt-5.4~long":      0.0225,
+		"gpt-5.4~short":     0.015,
+		"gpt-5.4-mini":      0.0045,
+		"gpt-5.4-nano":      0.00125,
+		"gpt-5.4-pro":       0.27,
+		"gpt-5.4-pro~long":  0.27,
+		"gpt-5.4-pro~short": 0.18,
 
 		"gpt-5.3-codex": 0.014,
@@ -346,17 +360,27 @@ var OpenAiCodeInterpreterContainerCost = map[string]float64{
 var AllowedTools = []string{
 	"web_search",
 	"web_search_preview",
+	"web_search_preview_2025_03_11",
 	"code_interpreter",
 	"file_search",
 	"function",
 	"computer_use_preview",
+	"computer",
+	"computer_use",
 	"exec_command",
 	"shell",
-	"local_shell",
+	"apply_patch",
 	"filesystem",
 	"patch",
+	"namespace",
+	"custom",
+	"custom_code",
+	"mcp",
+	"tool_search",
+	"image_generation",
+	"skills",
 }
 
 type tokenCounter interface {
@@ -376,6 +400,8 @@ func NewCostEstimator(m map[string]map[string]float64, tc tokenCounter) *CostEst
 }
 
 func (ce *CostEstimator) EstimateTotalCost(model string, promptTks, completionTks int) (float64, error) {
+	totalTokens := int64(promptTks + completionTks)
+	model = ModelWithContextLength(model, totalTokens)
 	promptCost, err := ce.EstimatePromptCost(model, promptTks)
 	if err != nil {
 		return 0, err
 	}
@@ -455,7 +481,8 @@ func (ce *CostEstimator) EstimateChatCompletionPromptCostWithTokenCounts(r *goop
 		return 0, 0, err
 	}
 
-	cost, err := ce.EstimatePromptCost(r.Model, tks)
+	model := ModelWithContextLength(r.Model, int64(tks))
+	cost, err := ce.EstimatePromptCost(model, tks)
 	if err != nil {
 		return 0, 0, err
 	}
@@ -473,6 +500,7 @@ func (ce *CostEstimator) EstimateChatCompletionStreamCostWithTokenCounts(model s
 		return 0, 0, err
 	}
 
+	model = ModelWithContextLength(model, int64(tks))
 	cost, err := ce.EstimateCompletionCost(model, tks)
 	if err != nil {
 		return 0, 0, err
 	}
@@ -797,6 +825,9 @@ func (ce *CostEstimator) EstimateResponseApiTotalCost(model string, usage respon
 	cachedInputTokens := usage.InputTokensDetails.CachedTokens
 	outputTokens := usage.OutputTokens
 
+	totalTokens := inputTokens + cachedInputTokens + outputTokens
+	model = ModelWithContextLength(model, totalTokens)
+
 	cachedInputCost, err := ce.estimateResponseApiTokensCost("cached-prompt", model, cachedInputTokens)
 	if err != nil {
 		cachedInputTokens = 0.0
@@ -1055,3 +1086,23 @@ func countTotalTokens(model string, r *goopenai.ChatCompletionRequest, tc tokenC
 
 	return tks + ftks + mtks, err
 }
+
+var modelWithLengthCtx = []string{
+	"gpt-5.4",
+	"gpt-5.4-pro",
+}
+
+func ModelWithContextLength(model string, tokens int64) string {
+	trimmed := strings.TrimSpace(model)
+	if slices.Contains(modelWithLengthCtx, trimmed) {
+		return trimmed + contextLengthSuffixByTokens(tokens)
+	}
+	return trimmed
+}
+
+func contextLengthSuffixByTokens(tokens int64) string {
+	if tokens >= 272000 {
+		return "~long"
+	}
+	return "~short"
+}
diff --git a/internal/server/web/proxy/middleware.go b/internal/server/web/proxy/middleware.go
index c6f328e..e9b26aa 100644
--- a/internal/server/web/proxy/middleware.go
+++ b/internal/server/web/proxy/middleware.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"io"
 	"net/http"
+	"slices"
 	"strconv"
 	"strings"
 	"time"
@@ -823,20 +824,20 @@ func getMiddleware(cpm CustomProvidersManager, rm routeManager, pm PoliciesManag
 			return
 		}
 
-		//hasNotAllowedTools := false
-		//for _, tool := range responsesReq.Tools {
-		//	if !slices.Contains(openai.AllowedTools, tool.Type) {
-		//		hasNotAllowedTools = true
-		//		break
-		//	}
-		//}
-		//
-		//if hasNotAllowedTools {
-		//	telemetry.Incr("bricksllm.proxy.get_middleware.tool_not_allowed", nil, 1)
-		//	JSON(c, http.StatusForbidden, "[BricksLLM] one of the tools is not allowed")
-		//	c.Abort()
-		//	return
-		//}
+		hasNotAllowedTools := false
+		for _, tool := range responsesReq.Tools {
+			if !slices.Contains(openai.AllowedTools, tool.Type) {
+				hasNotAllowedTools = true
+				break
+			}
+		}
+
+		if hasNotAllowedTools {
+			telemetry.Incr("bricksllm.proxy.get_middleware.tool_not_allowed", nil, 1)
+			JSON(c, http.StatusForbidden, "[BricksLLM] one of the tools is not allowed")
+			c.Abort()
+			return
+		}
 
 		isCreateContainerTool := false
 		var containerMemLimit string
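
Reviewer note: below is a minimal, self-contained sketch of the context-length pricing tiers this patch introduces in internal/provider/openai/cost.go. The helper bodies are copied from the hunk above; the package scaffolding and the token counts in main are illustrative only, and the 272,000-token cutoff is taken as given from the diff rather than from any published context-window spec.

package main

import (
	"fmt"
	"slices"
	"strings"
)

// modelWithLengthCtx mirrors the list added in cost.go: only these base
// models get a context-length pricing tier appended to their map key.
var modelWithLengthCtx = []string{
	"gpt-5.4",
	"gpt-5.4-pro",
}

// ModelWithContextLength rewrites a base model name into the tiered key
// ("~short" or "~long") used for OpenAiPerThousandTokenCost lookups.
// Models outside modelWithLengthCtx pass through unchanged.
func ModelWithContextLength(model string, tokens int64) string {
	trimmed := strings.TrimSpace(model)
	if slices.Contains(modelWithLengthCtx, trimmed) {
		return trimmed + contextLengthSuffixByTokens(tokens)
	}
	return trimmed
}

// contextLengthSuffixByTokens picks the tier: requests at or above 272k
// total tokens bill at the "~long" rate, everything below at "~short".
func contextLengthSuffixByTokens(tokens int64) string {
	if tokens >= 272000 {
		return "~long"
	}
	return "~short"
}

func main() {
	fmt.Println(ModelWithContextLength("gpt-5.4", 1_000))        // gpt-5.4~short
	fmt.Println(ModelWithContextLength("gpt-5.4", 300_000))      // gpt-5.4~long
	fmt.Println(ModelWithContextLength("gpt-5.4-mini", 300_000)) // gpt-5.4-mini (not tiered)
}

Encoding the tier in the map key, rather than branching inside each estimator, lets every existing lookup path (prompt, completion, and cached-prompt) pick up tiered pricing through a one-line change at each call site. The bare "gpt-5.4" and "gpt-5.4-pro" keys remain in the maps, presumably as fallbacks for callers that never go through ModelWithContextLength.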