diff --git a/src/pyob/core_utils.py b/src/pyob/core_utils.py index 12e2d75..9939b2b 100644 --- a/src/pyob/core_utils.py +++ b/src/pyob/core_utils.py @@ -343,6 +343,50 @@ def stream_ollama(self, prompt: str, on_chunk) -> str: logger.error(f"Ollama Error: {e}") return response_text + def stream_github_models(self, prompt: str, on_chunk) -> str: + """Fallback to GitHub Models API (Phi-4).""" + token = os.environ.get("GITHUB_TOKEN") + if not token: + return "ERROR_CODE_GITHUB_TOKEN_MISSING" + + # GitHub Models use the Azure AI Inference endpoint + endpoint = "https://models.inference.ai.azure.com/chat/completions" + headers = { + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + } + data = { + "messages": [{"role": "user", "content": prompt}], + "model": "Phi-4", + "stream": True, + "temperature": 0.1, + } + + full_text = "" + try: + response = requests.post( + endpoint, headers=headers, json=data, stream=True, timeout=120 + ) + if response.status_code != 200: + return f"ERROR_CODE_{response.status_code}" + + for line in response.iter_lines(): + if line: + decoded_line = line.decode("utf-8").replace("data: ", "") + if decoded_line == "[DONE]": + break + try: + chunk = json.loads(decoded_line) + content = chunk["choices"][0]["delta"].get("content", "") + if content: + full_text += content + on_chunk() + except Exception: + continue + return full_text + except Exception as e: + return f"ERROR_CODE_EXCEPTION: {e}" + def _stream_single_llm( self, prompt: str, key: str | None = None, context: str = "" ) -> str: @@ -385,10 +429,12 @@ def on_chunk(): response_text = self.stream_gemini(prompt, key, on_chunk) else: if os.environ.get("GITHUB_ACTIONS") == "true": - first_chunk_received[0] = True - return "ERROR_CODE_CLOUD_OLLAMA_BLOCKED" - - response_text = self.stream_ollama(prompt, on_chunk) + logger.info( + "☁️ Gemini limited. Pivoting to GitHub Models (Phi-4)..." + ) + response_text = self.stream_github_models(prompt, on_chunk) + else: + response_text = self.stream_ollama(prompt, on_chunk) except Exception as e: first_chunk_received[0] = True return f"ERROR_CODE_EXCEPTION: {e}" @@ -418,25 +464,18 @@ def get_valid_llm_response(self, prompt: str, validator, context: str = "") -> s available_keys = [ k for k, cooldown in self.key_cooldowns.items() if now > cooldown ] + if not available_keys: if is_cloud: - wait_times = [ - cooldown - now for cooldown in self.key_cooldowns.values() - ] - sleep_duration = max( - 10, min(min(wait_times) if wait_times else 120, 1200) - ) - logger.warning( - f"⏳ CLOUD NOTICE: All keys rate-limited. Retrying Gemini in {int(sleep_duration)}s..." - ) - time.sleep(sleep_duration) - continue - - if not use_ollama: - logger.warning( - "🚫 All Gemini keys rate-limited. Falling back to Local Ollama." - ) - use_ollama = True + # In the cloud, we don't 'use_ollama', we just try the GitHub Models fallback + # which is handled inside _stream_single_llm(key=None) + use_ollama = False + else: + if not use_ollama: + logger.warning( + "🚫 Gemini keys limited. Falling back to Local Ollama." + ) + use_ollama = True else: use_ollama = False key = available_keys[attempts % len(available_keys)] @@ -444,28 +483,29 @@ def get_valid_llm_response(self, prompt: str, validator, context: str = "") -> s f"Attempting Gemini API Key {attempts % len(available_keys) + 1}/{len(available_keys)}" ) - if use_ollama: - logger.info("Using Local Ollama Engine...") - response_text = self._stream_single_llm(prompt, key=key, context=context) - if is_cloud and ( - response_text.startswith("ERROR_CODE_") or not response_text.strip() - ): - if "429" in response_text and key: + # Handle errors/rate-limits + if response_text.startswith("ERROR_CODE_429"): + if key: self.key_cooldowns[key] = time.time() + 1200 - logger.warning("⚠️ Key rate-limited. Rotating...") - else: - logger.warning( - "⚠️ Gemini error/empty response. Sleeping 10s before retry..." - ) - time.sleep(10) attempts += 1 continue - if response_text.startswith("ERROR_CODE_429"): - if key: - self.key_cooldowns[key] = time.time() + 1200 + # If Gemini fails/returns empty in the cloud, perform Smart Sleep + if is_cloud and ( + response_text.startswith("ERROR_CODE_") or not response_text.strip() + ): + wait_times = [ + cooldown - now for cooldown in self.key_cooldowns.values() + ] + sleep_duration = max( + 10, min(min(wait_times) if wait_times else 60, 600) + ) + logger.warning( + f"⏳ Cloud limit reached. Resuming in {int(sleep_duration)}s..." + ) + time.sleep(sleep_duration) attempts += 1 continue @@ -474,6 +514,7 @@ def get_valid_llm_response(self, prompt: str, validator, context: str = "") -> s continue if validator(response_text): + # --- SUCCESS BREATHER --- if is_cloud: time.sleep(2) return response_text