-
Notifications
You must be signed in to change notification settings - Fork 114
feat: add stream_validate() hook to Requirement (#900) #925
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
planetf1
merged 8 commits into
generative-computing:main
from
planetf1:feat/900-stream-validate
Apr 28, 2026
+209
−0
Merged
Changes from all commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
7488761
feat: add stream_validate() hook to Requirement (#900)
planetf1 3054f6f
fix(core): address PR #925 review feedback on stream_validate
planetf1 358e4d1
fix(core): make stream_validate backend/ctx keyword-only
planetf1 efded18
fix(core): fix stream_validate docstring and add missing stateful tests
planetf1 315a98c
test(core): make BulletCounter genuinely stateful via delta extraction
planetf1 2f3c423
test(core): fix missing type: ignore on backend=None in multi-line call
planetf1 09ef1a9
test(core): fix import path and clone test orchestrator pattern
planetf1 82bdd3a
fix(core): stream_validate receives a single chunk, not accumulated text
planetf1 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,164 @@ | ||
| """Unit tests for Requirement.stream_validate() hook.""" | ||
|
|
||
| import inspect | ||
| from copy import copy | ||
|
|
||
| import pytest | ||
|
|
||
| from mellea.core import Backend, Context, PartialValidationResult, Requirement | ||
|
|
||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_default_returns_unknown(): | ||
| req = Requirement(description="some requirement") | ||
| result = await req.stream_validate("some chunk", backend=None, ctx=None) # type: ignore[arg-type] | ||
| assert result.success == "unknown" | ||
|
|
||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_default_returns_partial_validation_result_instance(): | ||
| req = Requirement() | ||
| result = await req.stream_validate("chunk", backend=None, ctx=None) # type: ignore[arg-type] | ||
| assert isinstance(result, PartialValidationResult) | ||
|
|
||
|
|
||
| def test_stream_validate_is_coroutine(): | ||
| req = Requirement() | ||
| assert inspect.iscoroutinefunction(req.stream_validate) | ||
|
|
||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_subclass_can_return_pass(): | ||
| class PassRequirement(Requirement): | ||
| async def stream_validate( | ||
| self, chunk: str, *, backend: Backend, ctx: Context | ||
| ) -> PartialValidationResult: | ||
| return PartialValidationResult("pass") | ||
|
|
||
| req = PassRequirement(description="always passes") | ||
| result = await req.stream_validate("any chunk", backend=None, ctx=None) # type: ignore[arg-type] | ||
| assert result.success == "pass" | ||
|
|
||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_subclass_can_return_fail(): | ||
| class FailRequirement(Requirement): | ||
| async def stream_validate( | ||
| self, chunk: str, *, backend: Backend, ctx: Context | ||
| ) -> PartialValidationResult: | ||
| if "bad" in chunk: | ||
| return PartialValidationResult("fail", reason="bad word detected") | ||
| return PartialValidationResult("unknown") | ||
|
|
||
| req = FailRequirement(description="no bad words") | ||
| result = await req.stream_validate("this is bad content", backend=None, ctx=None) # type: ignore[arg-type] | ||
| assert result.success == "fail" | ||
| assert result.reason == "bad word detected" | ||
|
|
||
| result_unknown = await req.stream_validate("good content", backend=None, ctx=None) # type: ignore[arg-type] | ||
| assert result_unknown.success == "unknown" | ||
|
|
||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_does_not_mutate_requirement(): | ||
| req = Requirement(description="original description") | ||
| original_description = req.description | ||
| original_output = req._output | ||
| original_validation_fn = req.validation_fn | ||
|
|
||
| await req.stream_validate("some chunk", backend=None, ctx=None) # type: ignore[arg-type] | ||
|
|
||
| assert req.description == original_description | ||
| assert req._output == original_output | ||
| assert req.validation_fn == original_validation_fn | ||
|
|
||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_stream_validate_idempotent(): | ||
| req = Requirement(description="repeated calls") | ||
| result1 = await req.stream_validate("chunk one", backend=None, ctx=None) # type: ignore[arg-type] | ||
| result2 = await req.stream_validate("chunk two", backend=None, ctx=None) # type: ignore[arg-type] | ||
| assert result1.success == "unknown" | ||
| assert result2.success == "unknown" | ||
| assert req._output is None | ||
|
|
||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_stateful_subclass_accumulates_state(): | ||
| """Stateful subclass correctly accumulates state across stream_validate calls. | ||
|
|
||
| Each call receives a single chunk (the delta produced by the chunking | ||
| strategy). Requirements maintain their own running state across calls | ||
| rather than re-scanning accumulated text. | ||
| """ | ||
|
|
||
| class BulletCounter(Requirement): | ||
| def __init__(self) -> None: | ||
| super().__init__(description="no more than 3 bullets") | ||
| self._bullet_count = 0 | ||
|
|
||
| async def stream_validate( | ||
| self, chunk: str, *, backend: Backend, ctx: Context | ||
| ) -> PartialValidationResult: | ||
| self._bullet_count += chunk.count("\n-") | ||
| if self._bullet_count > 3: | ||
| return PartialValidationResult( | ||
| "fail", reason=f"{self._bullet_count} bullets exceeds limit" | ||
| ) | ||
| return PartialValidationResult("unknown") | ||
|
|
||
| req = BulletCounter() | ||
| assert req._bullet_count == 0 | ||
|
|
||
| await req.stream_validate("intro text", backend=None, ctx=None) # type: ignore[arg-type] | ||
| assert req._bullet_count == 0 | ||
|
|
||
| await req.stream_validate("\n- one\n- two", backend=None, ctx=None) # type: ignore[arg-type] | ||
| assert req._bullet_count == 2 | ||
|
|
||
| result = await req.stream_validate( | ||
| "\n- three\n- four", | ||
| backend=None, # type: ignore[arg-type] | ||
| ctx=None, # type: ignore[arg-type] | ||
| ) | ||
| assert req._bullet_count == 4 | ||
| assert result.success == "fail" | ||
| assert result.reason is not None and "4" in result.reason | ||
|
|
||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_stateful_subclass_clone_isolation(): | ||
| """Orchestrator clone pattern: copy() before each attempt gives a fresh independent clone. | ||
|
|
||
| The orchestrator holds the original requirement and never calls stream_validate on it | ||
| directly. Before each attempt it clones the original; each clone starts from the | ||
| original's (zero) state and advances independently. | ||
| """ | ||
|
|
||
| class CallCounter(Requirement): | ||
| def __init__(self) -> None: | ||
| super().__init__(description="call counter") | ||
| self._calls = 0 | ||
|
|
||
| async def stream_validate( | ||
| self, chunk: str, *, backend: Backend, ctx: Context | ||
| ) -> PartialValidationResult: | ||
| self._calls += 1 | ||
| return PartialValidationResult("unknown") | ||
|
|
||
| req = CallCounter() # original — never used directly by the orchestrator | ||
|
|
||
| # Attempt 1 | ||
| attempt1 = copy(req) | ||
| assert attempt1._calls == 0 | ||
| await attempt1.stream_validate("a", backend=None, ctx=None) # type: ignore[arg-type] | ||
| await attempt1.stream_validate("b", backend=None, ctx=None) # type: ignore[arg-type] | ||
| assert attempt1._calls == 2 | ||
|
|
||
| # Attempt 2 (retry) — fresh clone from the same original | ||
| attempt2 = copy(req) | ||
| assert attempt2._calls == 0 # starts clean, not carrying attempt1's state | ||
| await attempt2.stream_validate("c", backend=None, ctx=None) # type: ignore[arg-type] | ||
| assert attempt2._calls == 1 | ||
|
|
||
| assert req._calls == 0 # original never mutated |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Clarification, the generation context has the uncomputed mot at this point, right? That's the reason for the warning?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes - they need to rely on the chunk & anything they've accumulated themselves. mot_is_computed stays false until streaming ends so we can't really say for sure what's in it. Tried to ensure we capture the behaviour in the docstrings
I think any changes to this initial approach -- and making the mot more responsible -- is a later phase.
Thanks for approval!
I have a stacked PR which I'll get out asap too (may not be until tomorrow)