From 65d4ac7ce6cd2ee63865f10a9cb35c7f235737fe Mon Sep 17 00:00:00 2001
From: Vic <125237471+vicsanity623@users.noreply.github.com>
Date: Mon, 9 Mar 2026 16:58:19 -0700
Subject: [PATCH 1/3] core_utils: add GitHub Models (Phi-4) fallback for cloud runs

---
 src/pyob/core_utils.py | 109 ++++++++++++++++++++++++-----------------
 1 file changed, 65 insertions(+), 44 deletions(-)

diff --git a/src/pyob/core_utils.py b/src/pyob/core_utils.py
index 12e2d75..0552887 100644
--- a/src/pyob/core_utils.py
+++ b/src/pyob/core_utils.py
@@ -342,6 +342,46 @@ def stream_ollama(self, prompt: str, on_chunk) -> str:
         except Exception as e:
             logger.error(f"Ollama Error: {e}")
         return response_text
+        
+    def stream_github_models(self, prompt: str, on_chunk) -> str:
+        """Fallback to GitHub Models API (Phi-4)."""
+        token = os.environ.get("GITHUB_TOKEN")
+        if not token:
+            return "ERROR_CODE_GITHUB_TOKEN_MISSING"
+
+        # GitHub Models use the Azure AI Inference endpoint
+        endpoint = "https://models.inference.ai.azure.com/chat/completions"
+        headers = {
+            "Authorization": f"Bearer {token}",
+            "Content-Type": "application/json"
+        }
+        data = {
+            "messages": [{"role": "user", "content": prompt}],
+            "model": "Phi-4",
+            "stream": True,
+            "temperature": 0.1
+        }
+
+        full_text = ""
+        try:
+            response = requests.post(endpoint, headers=headers, json=data, stream=True, timeout=120)
+            if response.status_code != 200:
+                return f"ERROR_CODE_{response.status_code}"
+
+            for line in response.iter_lines():
+                if line:
+                    decoded_line = line.decode("utf-8").replace("data: ", "")
+                    if decoded_line == "[DONE]": break
+                    try:
+                        chunk = json.loads(decoded_line)
+                        content = chunk["choices"][0]["delta"].get("content", "")
+                        if content:
+                            full_text += content
+                            on_chunk() 
+                    except: continue
+            return full_text
+        except Exception as e:
+            return f"ERROR_CODE_EXCEPTION: {e}"
 
     def _stream_single_llm(
         self, prompt: str, key: str | None = None, context: str = ""
@@ -385,10 +425,10 @@ def on_chunk():
                 response_text = self.stream_gemini(prompt, key, on_chunk)
             else:
                 if os.environ.get("GITHUB_ACTIONS") == "true":
-                    first_chunk_received[0] = True
-                    return "ERROR_CODE_CLOUD_OLLAMA_BLOCKED"
-
-                response_text = self.stream_ollama(prompt, on_chunk)
+                    logger.info("☁️ Gemini limited. Pivoting to GitHub Models (Phi-4)...")
+                    response_text = self.stream_github_models(prompt, on_chunk)
+                else:
+                    response_text = self.stream_ollama(prompt, on_chunk)
         except Exception as e:
             first_chunk_received[0] = True
             return f"ERROR_CODE_EXCEPTION: {e}"
@@ -408,9 +448,7 @@ def get_valid_llm_response(self, prompt: str, validator, context: str = "") -> s
         use_ollama = False
         is_cloud = os.environ.get("GITHUB_ACTIONS") == "true"
 
-        logger.info(
-            f"📊 Engine check: Found {len(self.key_cooldowns)} Gemini API keys."
-        )
+        logger.info(f"📊 Engine check: Found {len(self.key_cooldowns)} Gemini API keys.")
 
         while True:
             key = None
@@ -418,54 +456,36 @@ def get_valid_llm_response(self, prompt: str, validator, context: str = "") -> s
             available_keys = [
                 k for k, cooldown in self.key_cooldowns.items() if now > cooldown
             ]
+            
             if not available_keys:
                 if is_cloud:
-                    wait_times = [
-                        cooldown - now for cooldown in self.key_cooldowns.values()
-                    ]
-                    sleep_duration = max(
-                        10, min(min(wait_times) if wait_times else 120, 1200)
-                    )
-                    logger.warning(
-                        f"⏳ CLOUD NOTICE: All keys rate-limited. Retrying Gemini in {int(sleep_duration)}s..."
-                    )
-                    time.sleep(sleep_duration)
-                    continue
-
-                if not use_ollama:
-                    logger.warning(
-                        "🚫 All Gemini keys rate-limited. Falling back to Local Ollama."
-                    )
-                    use_ollama = True
+                    # In the cloud, we don't 'use_ollama', we just try the GitHub Models fallback
+                    # which is handled inside _stream_single_llm(key=None)
+                    use_ollama = False 
+                else:
+                    if not use_ollama:
+                        logger.warning("🚫 Gemini keys limited. Falling back to Local Ollama.")
+                        use_ollama = True
             else:
                 use_ollama = False
                 key = available_keys[attempts % len(available_keys)]
-                logger.info(
-                    f"Attempting Gemini API Key {attempts % len(available_keys) + 1}/{len(available_keys)}"
-                )
-
-            if use_ollama:
-                logger.info("Using Local Ollama Engine...")
+                logger.info(f"Attempting Gemini API Key {attempts % len(available_keys) + 1}/{len(available_keys)}")
 
             response_text = self._stream_single_llm(prompt, key=key, context=context)
 
-            if is_cloud and (
-                response_text.startswith("ERROR_CODE_") or not response_text.strip()
-            ):
-                if "429" in response_text and key:
+            # Handle errors/rate-limits
+            if response_text.startswith("ERROR_CODE_429"):
+                if key:
                     self.key_cooldowns[key] = time.time() + 1200
-                    logger.warning("⚠️ Key rate-limited. Rotating...")
-                else:
-                    logger.warning(
-                        "⚠️ Gemini error/empty response. Sleeping 10s before retry..."
-                    )
-                    time.sleep(10)
                 attempts += 1
                 continue
 
-            if response_text.startswith("ERROR_CODE_429"):
-                if key:
-                    self.key_cooldowns[key] = time.time() + 1200
+            # If Gemini fails/returns empty in the cloud, perform Smart Sleep
+            if is_cloud and (response_text.startswith("ERROR_CODE_") or not response_text.strip()):
+                wait_times = [cooldown - now for cooldown in self.key_cooldowns.values()]
+                sleep_duration = max(10, min(min(wait_times) if wait_times else 60, 600))
+                logger.warning(f"⏳ Cloud limit reached. Resuming in {int(sleep_duration)}s...")
+                time.sleep(sleep_duration)
                 attempts += 1
                 continue
 
@@ -474,8 +494,9 @@ def get_valid_llm_response(self, prompt: str, validator, context: str = "") -> s
                 continue
 
             if validator(response_text):
+                # --- SUCCESS BREATHER ---
                 if is_cloud:
-                    time.sleep(2)
+                    time.sleep(2) 
                 return response_text
             else:
                 logger.warning("LLM response failed validation. Retrying...")

From 3c3cf2aa1529065cc91e443ce160506a665a5ebd Mon Sep 17 00:00:00 2001
From: Vic <125237471+vicsanity623@users.noreply.github.com>
Date: Mon, 9 Mar 2026 17:07:25 -0700
Subject: [PATCH 2/3] core_utils: split one-line break and bare except in stream_github_models

---
 src/pyob/core_utils.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/pyob/core_utils.py b/src/pyob/core_utils.py
index 0552887..57e0963 100644
--- a/src/pyob/core_utils.py
+++ b/src/pyob/core_utils.py
@@ -371,14 +371,16 @@ def stream_github_models(self, prompt: str, on_chunk) -> str:
             for line in response.iter_lines():
                 if line:
                     decoded_line = line.decode("utf-8").replace("data: ", "")
-                    if decoded_line == "[DONE]": break
+                    if decoded_line == "[DONE]":
+                        break
                     try:
                         chunk = json.loads(decoded_line)
                         content = chunk["choices"][0]["delta"].get("content", "")
                         if content:
                             full_text += content
                             on_chunk() 
-                    except: continue
+                    except Exception:
+                        continue
             return full_text
         except Exception as e:
             return f"ERROR_CODE_EXCEPTION: {e}"

From dbf0639bc166325ff3da334c51842bf32aa3d2d9 Mon Sep 17 00:00:00 2001
From: vicsanity623 <125237471+vicsanity623@users.noreply.github.com>
Date: Tue, 10 Mar 2026 00:07:49 +0000
Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=AA=84=20PyOB:=20Automated=20Lint=20&?=
 =?UTF-8?q?=20Format=20Fixes?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/pyob/core_utils.py | 50 ++++++++++++++++++++++++++++--------------
 1 file changed, 34 insertions(+), 16 deletions(-)

diff --git a/src/pyob/core_utils.py b/src/pyob/core_utils.py
index 57e0963..9939b2b 100644
--- a/src/pyob/core_utils.py
+++ b/src/pyob/core_utils.py
@@ -342,7 +342,7 @@ def stream_ollama(self, prompt: str, on_chunk) -> str:
         except Exception as e:
             logger.error(f"Ollama Error: {e}")
         return response_text
-        
+
     def stream_github_models(self, prompt: str, on_chunk) -> str:
         """Fallback to GitHub Models API (Phi-4)."""
         token = os.environ.get("GITHUB_TOKEN")
@@ -353,18 +353,20 @@ def stream_github_models(self, prompt: str, on_chunk) -> str:
         endpoint = "https://models.inference.ai.azure.com/chat/completions"
         headers = {
             "Authorization": f"Bearer {token}",
-            "Content-Type": "application/json"
+            "Content-Type": "application/json",
         }
         data = {
             "messages": [{"role": "user", "content": prompt}],
             "model": "Phi-4",
             "stream": True,
-            "temperature": 0.1
+            "temperature": 0.1,
         }
 
         full_text = ""
         try:
-            response = requests.post(endpoint, headers=headers, json=data, stream=True, timeout=120)
+            response = requests.post(
+                endpoint, headers=headers, json=data, stream=True, timeout=120
+            )
             if response.status_code != 200:
                 return f"ERROR_CODE_{response.status_code}"
 
@@ -378,7 +380,7 @@ def stream_github_models(self, prompt: str, on_chunk) -> str:
                         content = chunk["choices"][0]["delta"].get("content", "")
                         if content:
                             full_text += content
-                            on_chunk() 
+                            on_chunk()
                     except Exception:
                         continue
             return full_text
@@ -427,7 +429,9 @@ def on_chunk():
                 response_text = self.stream_gemini(prompt, key, on_chunk)
             else:
                 if os.environ.get("GITHUB_ACTIONS") == "true":
-                    logger.info("☁️ Gemini limited. Pivoting to GitHub Models (Phi-4)...")
+                    logger.info(
+                        "☁️ Gemini limited. Pivoting to GitHub Models (Phi-4)..."
+                    )
                     response_text = self.stream_github_models(prompt, on_chunk)
                 else:
                     response_text = self.stream_ollama(prompt, on_chunk)
@@ -450,7 +454,9 @@ def get_valid_llm_response(self, prompt: str, validator, context: str = "") -> s
         use_ollama = False
         is_cloud = os.environ.get("GITHUB_ACTIONS") == "true"
 
-        logger.info(f"📊 Engine check: Found {len(self.key_cooldowns)} Gemini API keys.")
+        logger.info(
+            f"📊 Engine check: Found {len(self.key_cooldowns)} Gemini API keys."
+        )
 
         while True:
             key = None
@@ -458,20 +464,24 @@ def get_valid_llm_response(self, prompt: str, validator, context: str = "") -> s
             available_keys = [
                 k for k, cooldown in self.key_cooldowns.items() if now > cooldown
             ]
-            
+
             if not available_keys:
                 if is_cloud:
                     # In the cloud, we don't 'use_ollama', we just try the GitHub Models fallback
                     # which is handled inside _stream_single_llm(key=None)
-                    use_ollama = False 
+                    use_ollama = False
                 else:
                     if not use_ollama:
-                        logger.warning("🚫 Gemini keys limited. Falling back to Local Ollama.")
+                        logger.warning(
+                            "🚫 Gemini keys limited. Falling back to Local Ollama."
+                        )
                         use_ollama = True
             else:
                 use_ollama = False
                 key = available_keys[attempts % len(available_keys)]
-                logger.info(f"Attempting Gemini API Key {attempts % len(available_keys) + 1}/{len(available_keys)}")
+                logger.info(
+                    f"Attempting Gemini API Key {attempts % len(available_keys) + 1}/{len(available_keys)}"
+                )
 
             response_text = self._stream_single_llm(prompt, key=key, context=context)
 
@@ -483,10 +493,18 @@ def get_valid_llm_response(self, prompt: str, validator, context: str = "") -> s
                 continue
 
             # If Gemini fails/returns empty in the cloud, perform Smart Sleep
-            if is_cloud and (response_text.startswith("ERROR_CODE_") or not response_text.strip()):
-                wait_times = [cooldown - now for cooldown in self.key_cooldowns.values()]
-                sleep_duration = max(10, min(min(wait_times) if wait_times else 60, 600))
-                logger.warning(f"⏳ Cloud limit reached. Resuming in {int(sleep_duration)}s...")
+            if is_cloud and (
+                response_text.startswith("ERROR_CODE_") or not response_text.strip()
+            ):
+                wait_times = [
+                    cooldown - now for cooldown in self.key_cooldowns.values()
+                ]
+                sleep_duration = max(
+                    10, min(min(wait_times) if wait_times else 60, 600)
+                )
+                logger.warning(
+                    f"⏳ Cloud limit reached. Resuming in {int(sleep_duration)}s..."
+                )
                 time.sleep(sleep_duration)
                 attempts += 1
                 continue
@@ -498,7 +516,7 @@ def get_valid_llm_response(self, prompt: str, validator, context: str = "") -> s
             if validator(response_text):
                 # --- SUCCESS BREATHER ---
                 if is_cloud:
-                    time.sleep(2) 
+                    time.sleep(2)
                 return response_text
             else:
                 logger.warning("LLM response failed validation. Retrying...")