diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 832088ec37..5e5dde5db3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -119,8 +119,6 @@ jobs: ./bin/lint.sh - name: Type check - env: - PYTHON_VERSION: ${{ matrix.python-version }} run: | source pygraphistry/bin/activate ./bin/typecheck.sh diff --git a/.gitignore b/.gitignore index 2bd235d015..35704d511b 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,8 @@ pypirc # Distribution / packaging .Python env/ +.venv/ +.hypothesis/ build/ develop-eggs/ dist/ diff --git a/CHANGELOG.md b/CHANGELOG.md index c1e7e16a73..12961915bf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,12 +8,22 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## [Development] +### Added +- **Compute / hop**: `hop()` supports `min_hops`/`max_hops` traversal bounds plus optional hop labels for nodes, edges, and seeds, and post-traversal slicing via `output_min_hops`/`output_max_hops` to keep outputs compact while traversing wider ranges. +- **Docs / hop**: Added bounded-hop walkthrough notebook (`docs/source/gfql/hop_bounds.ipynb`), cheatsheet and GFQL spec updates, and examples showing how to combine hop ranges, labels, and output slicing. +- **GFQL / reference**: Extended the pandas reference enumerator and parity tests to cover hop ranges, labeling, and slicing so GFQL correctness checks include the new traversal shapes. ### Fixed -- **GFQL:** `Chain` now validates on construction (matching docs) and rejects invalid hops immediately; pass `validate=False` to defer validation when assembling advanced flows (fixes #860). +- **Compute / hop**: Exact-hop traversals now prune branches that do not reach `min_hops`, avoid reapplying min-hop pruning in reverse passes, keep seeds in wavefront outputs, and reuse forward wavefronts when recomputing labels so edge/node hop labels stay aligned (fixes 3-hop branch inclusion issues and mislabeled slices). +- **GFQL**: `Chain` now validates on construction (matching docs) and rejects invalid hops immediately; pass `validate=False` to defer validation when assembling advanced flows (fixes #860). - **GFQL / eq:** `eq()` now accepts strings in addition to numeric/temporal values (use `isna()`/`notna()` for nulls); added coverage across validator, schema validation, JSON, and GFQL runtime (fixes #862). - ### Docs -- **GFQL validation:** Clarified `Chain` constructor validation defaults, `validate=False` defer option, validation phases, and guidance for large/nested ASTs to reduce redundant validation (issue #860). +- **GFQL validation**: Clarified `Chain` constructor validation defaults, `validate=False` defer option, validation phases, and guidance for large/nested ASTs to reduce redundant validation (issue #860). +### Tests +- **GFQL / hop**: Expanded `test_compute_hops.py` and GFQL parity suites to assert branch pruning, bounded outputs, label collision handling, and forward/reverse slice behavior. +- **Reference enumerator**: Added oracle parity tests for hop ranges and output slices to guard GFQL integrations. +### Infra +- **CI / docs-only**: Skip Python lint/type/test matrices when the tip commit only touches docs/markdown/notebooks, and warm a shared HuggingFace cache with pinned CPU torch for AI jobs to avoid HF throttling. +- **Tooling**: `bin/lint.sh` / `bin/typecheck.sh` resolve interpreters consistently (uv → python -m → bare) without forcing PYTHON_VERSION, keeping developer and CI runs aligned. 
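+
+  Example of the new bounded-hop traversal (a sketch; `g` is any PyGraphistry graph, `seeds` any DataFrame of seed node ids, and the `hop`/`edge_hop` column names are arbitrary):
+
+  ```python
+  g2 = g.hop(
+      nodes=seeds,
+      min_hops=2,                  # traverse at least 2 hops...
+      max_hops=4,                  # ...and at most 4
+      output_min_hops=3,           # keep only hops 3..4 in the output
+      label_node_hops='hop',       # first hop step at which each node is reached
+      label_edge_hops='edge_hop'   # hop step of each traversed edge
+  )
+  ```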
## [0.46.0 - 2025-12-01] @@ -39,6 +49,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ### Tests - **CI / Python**: Expand GitHub Actions coverage to Python 3.13/3.14 for CPU lint/type/test jobs, while pinning RAPIDS-dependent CPU/GPU suites to <=3.13 until NVIDIA publishes 3.14 wheels (ensures lint/mypy/pytest signal on the latest interpreter without breaking RAPIDS installs). +- **CI / Lint & Types**: Split lint/type runners from resolution: `bin/flake8.sh`/`bin/mypy.sh` are now minimal and respect `FLAKE8_CMD`/`MYPY_CMD`; `bin/lint.sh`/`bin/typecheck.sh` resolve `uvx` → `python -m …` → bare tool, relying on `mypy.ini`'s interpreter default instead of forcing 3.8. CI still uses `.[test]` installs and the matrix interpreters. - **GFQL**: Added deterministic + property-based oracle tests (triangles, alias reuse, cuDF conversions, Hypothesis) plus parity checks ensuring pandas GFQL chains match the oracle outputs. - **Layouts**: Added comprehensive test coverage for `circle_layout()` and `group_in_a_box_layout()` with partition support (CPU/GPU) diff --git a/README.md b/README.md index a94c4731b5..72939e89b0 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,6 @@ [![Uptime Robot status](https://img.shields.io/uptimerobot/status/m787548531-e9c7b7508fc76fea927e2313?label=hub.graphistry.com)](https://status.graphistry.com/) [](https://join.slack.com/t/graphistry-community/shared_invite/zt-53ik36w2-fpP0Ibjbk7IJuVFIRSnr6g) [![Twitter Follow](https://img.shields.io/twitter/follow/graphistry)](https://twitter.com/graphistry) -
@@ -34,7 +33,6 @@ PyGraphistry is an open source Python library for data scientists and developers *[View all connectors →](https://pygraphistry.readthedocs.io/en/latest/notebooks/plugins.connectors.html)* - * [**Prototype locally and deploy remotely:**](https://www.graphistry.com/get-started) Prototype from notebooks like Jupyter and Databricks using local CPUs & GPUs, and then power production dashboards & pipelines with Graphistry Hub and your own self-hosted servers. * [**Query graphs with GFQL:**](https://pygraphistry.readthedocs.io/en/latest/gfql/index.html) Use GFQL, the first dataframe-native graph query language, to ask relationship questions that are difficult for tabular tools and without requiring a database. @@ -45,14 +43,10 @@ PyGraphistry is an open source Python library for data scientists and developers * [**Columnar & GPU acceleration:**](https://pygraphistry.readthedocs.io/en/latest/performance.html) CPU-mode ingestion and wrangling is fast due to native use of Apache Arrow and columnar analytics, and the optional RAPIDS-based GPU mode delivers 100X+ speedups. - From global 10 banks, manufacturers, news agencies, and government agencies, to startups, game companies, scientists, biotechs, and NGOs, many teams are tackling their graph workloads with Graphistry. - - ## Gallery - The [notebook demo gallery](https://pygraphistry.readthedocs.io/en/latest/demos/for_analysis.html) shares many more live visualizations, demos, and integration examples @@ -68,8 +62,6 @@ The [notebook demo gallery](https://pygraphistry.readthedocs.io/en/latest/demos/
- - ## Install Common configurations: @@ -98,7 +90,6 @@ Common configurations: For further options, see the [installation guides](https://pygraphistry.readthedocs.io/en/latest/install/index.html) - ## Visualization quickstart Quickly go from raw data to a styled and interactive Graphistry graph visualization: @@ -129,7 +120,6 @@ g1_styled.plot() Explore [10 Minutes to Graphistry Visualization](https://pygraphistry.readthedocs.io/en/latest/visualization/10min.html) for more visualization examples and options - ## PyGraphistry[AI] & GFQL quickstart - CPU & GPU **CPU graph pipeline** combining graph ML, AI, mining, and visualization: @@ -175,7 +165,6 @@ g4.plot() Explore [10 Minutes to PyGraphistry](https://pygraphistry.readthedocs.io/en/latest/10min.html) for a wider variety of graph processing. - ## PyGraphistry documentation * [Main PyGraphistry documentation](https://pygraphistry.readthedocs.io/en/latest/) @@ -183,37 +172,35 @@ Explore [10 Minutes to PyGraphistry](https://pygraphistry.readthedocs.io/en/late * Get started: [Install](https://pygraphistry.readthedocs.io/en/latest/install/index.html), [UI Guide](https://hub.graphistry.com/docs/ui/index/), [Notebooks](https://pygraphistry.readthedocs.io/en/latest/demos/for_analysis.html) * Performance: [PyGraphistry CPU+GPU](https://pygraphistry.readthedocs.io/en/latest/performance.html) & [GFQL CPU+GPU](https://pygraphistry.readthedocs.io/en/latest/gfql/performance.html) * API References - - [PyGraphistry API Reference](https://pygraphistry.readthedocs.io/en/latest/api/index.html): [Visualization & Compute](https://pygraphistry.readthedocs.io/en/latest/visualization/index.html), [PyGraphistry Cheatsheet](https://pygraphistry.readthedocs.io/en/latest/cheatsheet.html) - - [GFQL Documentation](https://pygraphistry.readthedocs.io/en/latest/gfql/index.html): [GFQL Cheatsheet](https://pygraphistry.readthedocs.io/en/latest/gfql/quick.html) and [GFQL Operator Cheatsheet](https://pygraphistry.readthedocs.io/en/latest/gfql/predicates/quick.html) - - [Plugins](https://pygraphistry.readthedocs.io/en/latest/plugins.html): Databricks, Splunk, Neptune, Neo4j, RAPIDS, and more - - Web: [iframe](https://hub.graphistry.com/docs/api/1/rest/url/#urloptions), [JavaScript](https://hub.graphistry.com/static/js-docs/index.html?path=/docs/introduction--docs), [REST](https://hub.graphistry.com/docs/api/1/rest/auth/) + * [PyGraphistry API Reference](https://pygraphistry.readthedocs.io/en/latest/api/index.html): [Visualization & Compute](https://pygraphistry.readthedocs.io/en/latest/visualization/index.html), [PyGraphistry Cheatsheet](https://pygraphistry.readthedocs.io/en/latest/cheatsheet.html) + * [GFQL Documentation](https://pygraphistry.readthedocs.io/en/latest/gfql/index.html): [GFQL Cheatsheet](https://pygraphistry.readthedocs.io/en/latest/gfql/quick.html) and [GFQL Operator Cheatsheet](https://pygraphistry.readthedocs.io/en/latest/gfql/predicates/quick.html) + * [Plugins](https://pygraphistry.readthedocs.io/en/latest/plugins.html): Databricks, Splunk, Neptune, Neo4j, RAPIDS, and more + * Web: [iframe](https://hub.graphistry.com/docs/api/1/rest/url/#urloptions), [JavaScript](https://hub.graphistry.com/static/js-docs/index.html?path=/docs/introduction--docs), [REST](https://hub.graphistry.com/docs/api/1/rest/auth/) ## Graphistry ecosystem -- **Graphistry server:** - - Launch - [Graphistry Hub, Graphistry cloud marketplaces, and self-hosting](https://www.graphistry.com/get-started) - - Self-hosting: [Administration (including 
Docker)](https://github.com/graphistry/graphistry-cli) & [Kubernetes](https://github.com/graphistry/graphistry-helm) - -- **Graphistry client APIs:** - - Web: [iframe](https://hub.graphistry.com/docs/api/1/rest/url/#urloptions), [JavaScript](https://hub.graphistry.com/static/js-docs/index.html?path=/docs/introduction--docs), [REST](https://hub.graphistry.com/docs/api/1/rest/auth/) - - [PyGraphistry](https://pygraphistry.readthedocs.io/en/latest/index.html) - - [Graphistry for Microsoft PowerBI](https://hub.graphistry.com/docs/powerbi/pbi/) +* **Graphistry server:** + * Launch - [Graphistry Hub, Graphistry cloud marketplaces, and self-hosting](https://www.graphistry.com/get-started) + * Self-hosting: [Administration (including Docker)](https://github.com/graphistry/graphistry-cli) & [Kubernetes](https://github.com/graphistry/graphistry-helm) -- **Additional projects**: - - [Louie.ai](https://louie.ai/): GenAI-native notebooks & dashboards to talk to your databases & Graphistry - - [graph-app-kit](https://github.com/graphistry/graph-app-kit): Streamlit Python dashboards with batteries-include graph packages - - [cu-cat](https://chat.openai.com/chat): Automatic GPU feature engineering +* **Graphistry client APIs:** + * Web: [iframe](https://hub.graphistry.com/docs/api/1/rest/url/#urloptions), [JavaScript](https://hub.graphistry.com/static/js-docs/index.html?path=/docs/introduction--docs), [REST](https://hub.graphistry.com/docs/api/1/rest/auth/) + * [PyGraphistry](https://pygraphistry.readthedocs.io/en/latest/index.html) + * [Graphistry for Microsoft PowerBI](https://hub.graphistry.com/docs/powerbi/pbi/) +* **Additional projects**: + * [Louie.ai](https://louie.ai/): GenAI-native notebooks & dashboards to talk to your databases & Graphistry + * [graph-app-kit](https://github.com/graphistry/graph-app-kit): Streamlit Python dashboards with batteries-included graph packages + * [cu-cat](https://github.com/graphistry/cu-cat): Automatic GPU feature engineering ## Community and support -- [Blog](https://www.graphistry.com/blog) for tutorials, case studies, and updates -- [Slack](https://join.slack.com/t/graphistry-community/shared_invite/zt-53ik36w2-fpP0Ibjbk7IJuVFIRSnr6g): Join the Graphistry Community Slack for discussions and support -- [Twitter](https://twitter.com/graphistry) & [LinkedIn](https://www.linkedin.com/company/graphistry): Follow for updates -- [GitHub Issues](https://github.com/graphistry/pygraphistry/issues) open source support -- [Graphistry ZenDesk](https://graphistry.zendesk.com/) dedicated enterprise support +* [Blog](https://www.graphistry.com/blog) for tutorials, case studies, and updates +* [Slack](https://join.slack.com/t/graphistry-community/shared_invite/zt-53ik36w2-fpP0Ibjbk7IJuVFIRSnr6g): Join the Graphistry Community Slack for discussions and support +* [Twitter](https://twitter.com/graphistry) & [LinkedIn](https://www.linkedin.com/company/graphistry): Follow for updates +* [GitHub Issues](https://github.com/graphistry/pygraphistry/issues) open source support +* [Graphistry ZenDesk](https://graphistry.zendesk.com/) dedicated enterprise support ## Contribute See [CONTRIBUTING](https://pygraphistry.readthedocs.io/en/latest/CONTRIBUTING.html) and [DEVELOP](https://pygraphistry.readthedocs.io/en/latest/DEVELOP.html) for participating in PyGraphistry development, or reach out to our team - diff --git a/bin/flake8.sh b/bin/flake8.sh index 50d144b07b..bf75a8f421 100755 --- a/bin/flake8.sh +++ b/bin/flake8.sh @@ -1,29 +1,14 @@ #!/bin/bash set -e -# Try to find the best Python version
available -PYTHON_BIN="" -for ver in 3.14 3.13 3.12 3.11 3.10 3.9 3.8; do - if command -v python$ver &> /dev/null; then - PYTHON_BIN=python$ver - break - fi -done +# Minimal resolution: env override or host flake8; word-splitting is intended so FLAKE8_CMD may carry args (e.g. "uvx flake8") +FLAKE8_CMD_ARR=(${FLAKE8_CMD:-flake8}) -if [ -z "$PYTHON_BIN" ]; then - echo "No suitable Python version found (3.8-3.14)" +if ! "${FLAKE8_CMD_ARR[@]}" --version &> /dev/null; then + echo "flake8 not found. Set FLAKE8_CMD or install flake8 on PATH." exit 1 fi -echo "Using Python: $PYTHON_BIN" -$PYTHON_BIN --version - -# Install flake8 if not available -if ! $PYTHON_BIN -m flake8 --version &> /dev/null; then - echo "Installing flake8..." - $PYTHON_BIN -m pip install flake8 --user -fi - # Get the script directory and repo root SCRIPT_DIR="$( cd "$( dirname -- "${BASH_SOURCE[0]}" )" && pwd )" REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" @@ -44,7 +29,7 @@ echo "Running flake8 on: $TARGET" # Quick syntax error check echo "=== Running quick syntax check ===" -$PYTHON_BIN -m flake8 \ +"${FLAKE8_CMD_ARR[@]}" \ $TARGET \ --count \ --select=E9,F63,F7,F82 \ @@ -53,7 +38,7 @@ $PYTHON_BIN -m flake8 \ # Full lint check echo "=== Running full lint check ===" -$PYTHON_BIN -m flake8 \ +"${FLAKE8_CMD_ARR[@]}" \ $TARGET \ --exclude=graphistry/graph_vector_pb2.py,graphistry/_version.py \ --count \ @@ -62,4 +47,4 @@ $PYTHON_BIN -m flake8 \ --max-line-length=127 \ --statistics -echo "Flake8 check completed successfully!" \ No newline at end of file +echo "Flake8 check completed successfully!" diff --git a/bin/lint.sh b/bin/lint.sh index 68610fd4cf..cc99fd04ae 100755 --- a/bin/lint.sh +++ b/bin/lint.sh @@ -4,25 +4,15 @@ set -ex # Run from project root # Non-zero exit code on fail -flake8 --version - -# Quick syntax errors -flake8 \ - graphistry \ - --count \ - --select=E9,F63,F7,F82 \ - --show-source \ - --statistics - -# Deeper check -flake8 \ - graphistry \ - --exclude graphistry/graph_vector_pb2.py,graphistry/_version.py \ - --count \ - --ignore=C901,E121,E122,E123,E124,E125,E128,E131,E144,E201,E202,E203,E231,E251,E265,E301,E302,E303,E401,E501,E722,F401,W291,W293,W503 \ - --max-complexity=10 \ - --max-line-length=127 \ - --statistics +# Resolve flake8 command, then delegate to runner (prefer uvx, then python -m, then bare) +if command -v uvx >/dev/null 2>&1; then + FLAKE8_CMD="uvx flake8" +elif command -v python >/dev/null 2>&1; then + FLAKE8_CMD="python -m flake8" +else + FLAKE8_CMD="flake8" +fi +FLAKE8_CMD="$FLAKE8_CMD" ./bin/flake8.sh "$@" # Check for relative imports with '..' using flake8-quotes or custom regex # This will fail if any relative imports with .. are found diff --git a/bin/mypy.sh b/bin/mypy.sh index 4d632aa647..5494095f15 100755 --- a/bin/mypy.sh +++ b/bin/mypy.sh @@ -1,27 +1,18 @@ #!/bin/bash set -e -# Try to find the best Python version available -PYTHON_BIN="" -for ver in 3.14 3.13 3.12 3.11 3.10 3.9 3.8; do - if command -v python$ver &> /dev/null; then - PYTHON_BIN=python$ver - break - fi -done +# Minimal resolution: env override or host mypy; word-splitting is intended so MYPY_CMD may carry args (e.g. "uvx mypy") +MYPY_CMD_ARR=(${MYPY_CMD:-mypy}) -if [ -z "$PYTHON_BIN" ]; then - echo "No suitable Python version found (3.8-3.14)" - exit 1 +MYPY_EXTRA_ARGS_ARR=() +if [ -n "${MYPY_EXTRA_ARGS:-}" ]; then + read -r -a MYPY_EXTRA_ARGS_ARR <<< "$MYPY_EXTRA_ARGS" fi -echo "Using Python: $PYTHON_BIN" -$PYTHON_BIN --version - -# Install mypy if not available -if ! $PYTHON_BIN -m mypy --version &> /dev/null; then - echo "Installing mypy..." - $PYTHON_BIN -m pip install mypy --user +# Ensure mypy exists rather than installing (works in CI & avoids PEP 668) +if !
"${MYPY_CMD_ARR[@]}" --version &> /dev/null; then + echo "mypy not found. Set MYPY_CMD or install mypy on PATH." + exit 1 fi # Get the script directory and repo root @@ -33,32 +24,21 @@ cd "$REPO_ROOT" # Check if specific files were passed as arguments if [ $# -eq 0 ]; then - # No arguments, run on entire graphistry directory using config file - TARGET="" - CONFIG_ARG="--config-file mypy.ini" + TARGET_ARGS=("graphistry") else - # Use provided arguments - TARGET="$@" - # Still use config file for consistency - CONFIG_ARG="--config-file mypy.ini" + TARGET_ARGS=("$@") fi +CONFIG_ARGS=(--config-file mypy.ini) echo "Running mypy..." -if [ -z "$TARGET" ]; then - echo "Checking entire graphistry directory with mypy.ini config" -else - echo "Checking: $TARGET" -fi +echo "Checking: ${TARGET_ARGS[*]}" # Show mypy version -$PYTHON_BIN -m mypy --version +"${MYPY_CMD_ARR[@]}" --version # Run mypy with config file -# If no target specified, mypy will use the paths defined in mypy.ini -if [ -z "$TARGET" ]; then - $PYTHON_BIN -m mypy $CONFIG_ARG graphistry -else - $PYTHON_BIN -m mypy $CONFIG_ARG $TARGET -fi +CMD=( "${MYPY_CMD_ARR[@]}" "${MYPY_EXTRA_ARGS_ARR[@]}" "${CONFIG_ARGS[@]}" ) +CMD+=( "${TARGET_ARGS[@]}" ) +"${CMD[@]}" -echo "Mypy check completed!" \ No newline at end of file +echo "Mypy check completed!" diff --git a/bin/typecheck.sh b/bin/typecheck.sh index b3cfdc13e7..a493b55dae 100755 --- a/bin/typecheck.sh +++ b/bin/typecheck.sh @@ -4,11 +4,12 @@ set -ex # Run from project root # Non-zero exit code on fail -mypy --version - -if [ -n "$PYTHON_VERSION" ]; then - SHORT_VERSION=$(echo "$PYTHON_VERSION" | cut -d. -f1,2) - mypy --python-version "$SHORT_VERSION" --config-file mypy.ini graphistry +# Resolve mypy command, then delegate to runner (prefer uvx, then venv) +if command -v uvx >/dev/null 2>&1; then + MYPY_CMD="uvx mypy" +elif command -v python >/dev/null 2>&1; then + MYPY_CMD="python -m mypy" else - mypy --config-file mypy.ini graphistry + MYPY_CMD="mypy" fi +MYPY_CMD="$MYPY_CMD" ./bin/mypy.sh "$@" diff --git a/docs/docker/build-docs.sh b/docs/docker/build-docs.sh index 217301d3d0..f76d370308 100755 --- a/docs/docker/build-docs.sh +++ b/docs/docker/build-docs.sh @@ -55,6 +55,7 @@ esac NOTEBOOKS_TO_VALIDATE=( "/docs/test_notebooks/test_graphistry_import.ipynb" "/docs/source/demos/gfql/temporal_predicates.ipynb" + "/docs/source/gfql/hop_bounds.ipynb" ) for notebook in "${NOTEBOOKS_TO_VALIDATE[@]}"; do @@ -94,4 +95,4 @@ else: echo "$(basename $notebook) execution completed successfully" fi fi -done \ No newline at end of file +done diff --git a/docs/source/cheatsheet.md b/docs/source/cheatsheet.md index afcfdeeb93..92b9147e8d 100644 --- a/docs/source/cheatsheet.md +++ b/docs/source/cheatsheet.md @@ -906,6 +906,14 @@ g2.plot() # nodes are values from cols s, d, k1 direction='forward', # 'reverse', 'undirected' hops=2, # number (1..n hops, inclusive) or None if to_fixed_point to_fixed_point=False, + # optional traversal range + labeling + min_hops=1, # inclusive lower bound (defaults to 1 unless hops==0) + max_hops=3, # inclusive upper bound; defaults to hops + output_min_hops=None, # optional output slice lower bound (post-filter; defaults keep early hops) + output_max_hops=None, # optional output slice upper bound (post-filter; defaults to max_hops) + label_node_hops='hop', # write first hop step each node is reached (omit/None to skip) + label_edge_hops='edge_hop', # hop step for each traversed edge + label_seeds=True, # also tag starting seeds as hop 0 when labeling #every edge source node 
must match these source_node_match={"k2": 0, "k3": is_in(['a', 'b', 3, 4])}, @@ -1055,6 +1063,19 @@ g2c = g2.hop( # (a or b)-[1 to 8 hops]->(anynode), based on graph g2 g3 = g2.hop(pd.DataFrame({g2._node: ['a', 'b']}), hops=8) +# Bounded hops with labels and sliced outputs +g4 = g2.hop( + pd.DataFrame({g2._node: ['a']}), + min_hops=2, + max_hops=3, + output_min_hops=2, + output_max_hops=3, + label_node_hops='hop', + label_edge_hops='edge_hop', + label_seeds=True +) +g4._nodes[['node', 'hop']] + # (a or b)-[1 to 8 hops]->(anynode), based on graph g2 g3 = g2.hop(pd.DataFrame({g2._node: is_in(['a', 'b'])}), hops=8) diff --git a/docs/source/conf.py b/docs/source/conf.py index 4fd005eea7..0a232afdd1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -797,4 +797,3 @@ def on_builder(app: Sphinx) -> None: print('No custom handling for app.builder.name=', app.builder.name) app.connect('builder-inited', on_builder) - diff --git a/docs/source/gfql/hop_bounds.ipynb b/docs/source/gfql/hop_bounds.ipynb new file mode 100644 index 0000000000..8c01d30bee --- /dev/null +++ b/docs/source/gfql/hop_bounds.ipynb @@ -0,0 +1,1027 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b956750e", + "metadata": {}, + "source": [ + "# Hop ranges, slices, and labels\n", + "\n", + "GFQL examples for bounded hops, output slices, and labeling (Cypher-style patterns like `(a)-[*2..4]->(b)`).\n", + "\n", + "- Exact and ranged hops (`min_hops`/`max_hops`)\n", + "- Post-filtered output slices (`output_min_hops`/`output_max_hops`)\n", + "- Optional hop labels on nodes/edges; seeds = starting nodes (label seeds at hop 0 when requested)\n" + ] + }, + { + "cell_type": "markdown", + "id": "0086f2cf", + "metadata": {}, + "source": [ + "Visual of the toy branching chain used below (seed at 'a').\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b21b97b8", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T23:59:40.523337Z", + "iopub.status.busy": "2025-12-01T23:59:40.523070Z", + "iopub.status.idle": "2025-12-01T23:59:41.274958Z", + "shell.execute_reply": "2025-12-01T23:59:41.273610Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Edges:\n", + " s d\n", + "0 a b1\n", + "1 b1 c1\n", + "2 c1 d1\n", + "3 d1 e1\n", + "4 a b2\n", + "5 b2 c2\n", + "\n", + "Nodes:\n", + " id\n", + "0 a\n", + "1 b1\n", + "2 c1\n", + "3 d1\n", + "4 e1\n", + "5 b2\n", + "6 c2\n", + "\n", + "Seed ids: ['a']\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import graphistry\n", + "from graphistry.compute.ast import is_in, n, e_forward\n", + "\n", + "edges = pd.DataFrame({\n", + " 's': ['a', 'b1', 'c1', 'd1', 'a', 'b2'],\n", + " 'd': ['b1', 'c1', 'd1', 'e1', 'b2', 'c2']\n", + "})\n", + "\n", + "nodes = pd.DataFrame({'id': ['a', 'b1', 'c1', 'd1', 'e1', 'b2', 'c2']})\n", + "\n", + "g = graphistry.edges(edges, 's', 'd').nodes(nodes, 'id')\n", + "seed_ids = ['a']\n", + "\n", + "print('Edges:')\n", + "print(edges)\n", + "print()\n", + "print('Nodes:')\n", + "print(nodes)\n", + "print()\n", + "print('Seed ids:', seed_ids)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2a41ab6a", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T23:59:41.277891Z", + "iopub.status.busy": "2025-12-01T23:59:41.277498Z", + "iopub.status.idle": "2025-12-01T23:59:41.291927Z", + "shell.execute_reply": "2025-12-01T23:59:41.290547Z" + } + }, + "outputs": [], + "source": [ + "from IPython.display import SVG\n", + "from html import escape\n", + 
"import pandas as pd\n", + "\n", + "# Simple SVG renderer for the toy graph (fixed layout)\n", + "pos_lookup = {\n", + " 'a': (50, 100),\n", + " 'b1': (150, 70),\n", + " 'c1': (250, 70),\n", + " 'd1': (350, 70),\n", + " 'e1': (450, 70),\n", + " 'b2': (150, 130),\n", + " 'c2': (250, 130),\n", + "}\n", + "\n", + "def _format_label(val):\n", + " if pd.isna(val):\n", + " return None\n", + " if isinstance(val, (int, float)) and float(val).is_integer():\n", + " val = int(val)\n", + " return str(val)\n", + "\n", + "def render_svg(g_out, label_col=None, title=None):\n", + " node_col = g._node\n", + " src_col, dst_col = g._source, g._destination\n", + " nodes_df = g_out._nodes\n", + " edges_df = g_out._edges\n", + " parts = []\n", + " parts.append(\"\\n\")\n", + " parts.append(\"\\n\\n\")\n", + " if title:\n", + " safe_title = escape(title)\n", + " parts.append(f\"{safe_title}\\n\")\n", + " for _, row in edges_df.iterrows():\n", + " s = row[src_col]; d = row[dst_col]\n", + " if s in pos_lookup and d in pos_lookup:\n", + " x1, y1 = pos_lookup[s]; x2, y2 = pos_lookup[d]\n", + " parts.append(f\"\\n\")\n", + " for _, row in nodes_df.iterrows():\n", + " n = row[node_col]\n", + " if n not in pos_lookup:\n", + " continue\n", + " x, y = pos_lookup[n]\n", + " parts.append(f\"\\n\")\n", + " label_text = escape(str(n))\n", + " if label_col and label_col in nodes_df.columns:\n", + " label_val = _format_label(row[label_col])\n", + " if label_val is not None:\n", + " label_text = f\"{label_text}:{escape(label_val)}\"\n", + " parts.append(f\"{label_text}\\n\")\n", + " parts.append('')\n", + " return SVG(''.join(parts))\n", + "\n", + "# Fill missing hop labels from edge hops (keeps seeds unlabeled when excluded)\n", + "def fill_hops(g_out, label_col='hop', edge_label_col='edge_hop', seeds=None):\n", + " if g_out._nodes is None or g_out._edges is None:\n", + " return g_out\n", + " if label_col not in g_out._nodes.columns or edge_label_col not in g_out._edges.columns:\n", + " return g_out\n", + " nodes_df = g_out._nodes.copy()\n", + " edges_df = g_out._edges\n", + " hop_map = pd.concat([\n", + " edges_df[[g._source, edge_label_col]].rename(columns={g._source: g._node}),\n", + " edges_df[[g._destination, edge_label_col]].rename(columns={g._destination: g._node}),\n", + " ], ignore_index=True, sort=False).groupby(g._node)[edge_label_col].min()\n", + " mask = nodes_df[label_col].isna()\n", + " if seeds is not None:\n", + " mask = mask & ~nodes_df[g._node].isin(seeds)\n", + " if mask.any():\n", + " nodes_df.loc[mask, label_col] = nodes_df.loc[mask, g._node].map(hop_map)\n", + " try:\n", + " nodes_df[label_col] = pd.to_numeric(nodes_df[label_col], errors='coerce').astype('Int64')\n", + " except Exception:\n", + " pass\n", + " return g_out.nodes(nodes_df)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d197392a", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T23:59:41.295129Z", + "iopub.status.busy": "2025-12-01T23:59:41.294778Z", + "iopub.status.idle": "2025-12-01T23:59:41.305581Z", + "shell.execute_reply": "2025-12-01T23:59:41.304412Z" + } + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "Toy graph layout\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "a\n", + "\n", + "b1\n", + "\n", + "c1\n", + "\n", + "d1\n", + "\n", + "e1\n", + "\n", + "b2\n", + "\n", + "c2\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(render_svg(g, label_col=None, 
title='Toy graph layout'))\n" + ] + }, + { + "cell_type": "markdown", + "id": "4391e00f", + "metadata": {}, + "source": [ + "## Exact hops\n", + "Exact-hop GFQL edge calls for quick sanity checks before slicing or labeling.\n" + ] + }, + { + "cell_type": "markdown", + "id": "da750f98", + "metadata": {}, + "source": [ + "Exactly 1 hop from seed 'a' to its immediate neighbors (no labels).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4967ba69", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T23:59:41.308116Z", + "iopub.status.busy": "2025-12-01T23:59:41.307879Z", + "iopub.status.idle": "2025-12-01T23:59:41.455868Z", + "shell.execute_reply": "2025-12-01T23:59:41.454545Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1-hop nodes from 'a' (first hop):\n", + " id __gfql_output_node_hop__\n", + "0 a \n", + "1 b1 1\n", + "2 b2 1\n", + "\n", + "1-hop edges:\n", + " __gfql_output_edge_hop__ s d\n", + "0 1 a b1\n", + "1 1 a b2\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "1 hop\n", + "\n", + "\n", + "\n", + "a\n", + "\n", + "b1\n", + "\n", + "b2\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Exactly 1 hop (no labels)\n", + "hop_1 = g.gfql([\n", + " n({g._node: is_in(seed_ids)}),\n", + " e_forward(min_hops=1, max_hops=1),\n", + "])\n", + "\n", + "print(\"1-hop nodes from 'a' (first hop):\")\n", + "print(hop_1._nodes.sort_values(g._node))\n", + "print()\n", + "print('1-hop edges:')\n", + "print(hop_1._edges.sort_values([g._source, g._destination]))\n", + "display(render_svg(hop_1, label_col=None, title='1 hop'))" + ] + }, + { + "cell_type": "markdown", + "id": "3c38353e", + "metadata": {}, + "source": [ + "Exactly 3 hops bounded to three steps; default output keeps the earlier hops for context (no labels).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "80057761", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T23:59:41.459006Z", + "iopub.status.busy": "2025-12-01T23:59:41.458713Z", + "iopub.status.idle": "2025-12-01T23:59:41.677985Z", + "shell.execute_reply": "2025-12-01T23:59:41.676855Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3-hop nodes from 'a' (path to third hop):\n", + " id __gfql_output_node_hop__\n", + "0 a \n", + "1 b1 1\n", + "2 c1 2\n", + "3 d1 3\n", + "\n", + "3-hop edges:\n", + " __gfql_output_edge_hop__ s d\n", + "0 1 a b1\n", + "1 2 b1 c1\n", + "2 3 c1 d1\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "3 hops\n", + "\n", + "\n", + "\n", + "\n", + "a\n", + "\n", + "b1\n", + "\n", + "c1\n", + "\n", + "d1\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# Exactly 3 hops (no labels, default keeps path)\n", + "hop_3 = g.gfql([\n", + " n({g._node: is_in(seed_ids)}),\n", + " e_forward(min_hops=3, max_hops=3),\n", + "])\n", + "\n", + "print(\"3-hop nodes from 'a' (path to third hop):\")\n", + "print(hop_3._nodes.sort_values(g._node))\n", + "print()\n", + "print('3-hop edges:')\n", + "print(hop_3._edges.sort_values([g._source, g._destination]))\n", + "display(render_svg(hop_3, label_col=None, title='3 hops'))\n" + ] + }, + { + "cell_type": "markdown", + "id": "96c52631", + "metadata": {}, + "source": [ + "## Hop ranges\n", + "Variable-length traversal with full-path output (keep hops up to the 
bound).\n" + ] + }, + { + "cell_type": "markdown", + "id": "3468c224", + "metadata": {}, + "source": [ + "Range 1..3 hops from 'a' (unlabeled) to mirror a Cypher pattern like `(a)-[*1..3]->(?)`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0ab5310b", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T23:59:41.680783Z", + "iopub.status.busy": "2025-12-01T23:59:41.680450Z", + "iopub.status.idle": "2025-12-01T23:59:41.889793Z", + "shell.execute_reply": "2025-12-01T23:59:41.888400Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nodes within 1..3 hops from 'a':\n", + " id __gfql_output_node_hop__\n", + "0 a \n", + "1 b1 1\n", + "3 b2 1\n", + "2 c1 2\n", + "5 c2 2\n", + "4 d1 3\n", + "\n", + "Edges within 1..3 hops:\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " __gfql_output_edge_hop__ s d\n", + "0 1 a b1\n", + "3 1 a b2\n", + "1 2 b1 c1\n", + "4 2 b2 c2\n", + "2 3 c1 d1\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "Hops 1..3\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "a\n", + "\n", + "b1\n", + "\n", + "c1\n", + "\n", + "b2\n", + "\n", + "d1\n", + "\n", + "c2\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Range 1..3 hops (combined)\n", + "hop_range = g.gfql([\n", + " n({g._node: is_in(seed_ids)}),\n", + " e_forward(min_hops=1, max_hops=3),\n", + "])\n", + "\n", + "print(\"Nodes within 1..3 hops from 'a':\")\n", + "print(hop_range._nodes.sort_values(g._node))\n", + "print()\n", + "print('Edges within 1..3 hops:')\n", + "print(hop_range._edges.sort_values([g._source, g._destination]))\n", + "display(render_svg(hop_range, label_col=None, title='Hops 1..3'))" + ] + }, + { + "cell_type": "markdown", + "id": "584eb934", + "metadata": {}, + "source": [ + "## Output slicing\n", + "Post-filter the traversal results without changing the traversal itself.\n" + ] + }, + { + "cell_type": "markdown", + "id": "75f493d0", + "metadata": {}, + "source": [ + "Traverse 2..4 hops but only display hops 3..4, with hop numbers on nodes/edges.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4928bfdc", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T23:59:41.892458Z", + "iopub.status.busy": "2025-12-01T23:59:41.892138Z", + "iopub.status.idle": "2025-12-01T23:59:42.167012Z", + "shell.execute_reply": "2025-12-01T23:59:42.165892Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nodes in hops 3..4 (after traversing 2..4):\n", + " id hop\n", + "2 c1 3\n", + "0 d1 3\n", + "1 e1 4\n", + "\n", + "Edges in hops 3..4:\n", + " edge_hop s d\n", + "0 3 c1 d1\n", + "1 4 d1 e1\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "Slice hops 3..4\n", + "\n", + "\n", + "\n", + "d1:3\n", + "\n", + "e1:4\n", + "\n", + "c1:3\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Traverse 2..4 hops; output slice 3..4 with hop labels\n", + "hop_slice = g.gfql([\n", + " n({g._node: is_in(seed_ids)}),\n", + " e_forward(min_hops=2, max_hops=4, output_min_hops=3, output_max_hops=4, label_node_hops='hop', label_edge_hops='edge_hop'),\n", + "])\n", + "hop_slice = fill_hops(hop_slice, seeds=seed_ids)\n", + "\n", + "print('Nodes in hops 3..4 (after traversing 2..4):')\n", + "print(hop_slice._nodes.sort_values(['hop', g._node]))\n", + 
"print()\n", + "print('Edges in hops 3..4:')\n", + "print(hop_slice._edges.sort_values(['edge_hop', g._source, g._destination]))\n", + "display(render_svg(hop_slice, label_col='hop', title='Slice hops 3..4'))" + ] + }, + { + "cell_type": "markdown", + "id": "35972b5c", + "metadata": {}, + "source": [ + "Output_min below min_hops: keep the lead-in hops and label the seed at hop 0.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "75a454a5", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T23:59:42.170075Z", + "iopub.status.busy": "2025-12-01T23:59:42.169768Z", + "iopub.status.idle": "2025-12-01T23:59:42.421931Z", + "shell.execute_reply": "2025-12-01T23:59:42.421113Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nodes when output_min < min_hops (labels on, seeds labeled):\n", + " id hop\n", + "0 a 0\n", + "1 b1 1\n", + "2 c1 2\n", + "3 d1 3\n", + "\n", + "Edges when output_min < min_hops:\n", + " edge_hop s d\n", + "0 1 a b1\n", + "1 2 b1 c1\n", + "2 3 c1 d1\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "Output min < min_hops\n", + "\n", + "\n", + "\n", + "\n", + "a:0\n", + "\n", + "b1:1\n", + "\n", + "c1:2\n", + "\n", + "d1:3\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Output slice below traversal min (keeps earlier hops)\n", + "hop_slice_below = g.gfql([\n", + " n({g._node: is_in(seed_ids)}),\n", + " e_forward(min_hops=3, max_hops=3, output_min_hops=1, label_node_hops='hop', label_edge_hops='edge_hop', label_seeds=True),\n", + "])\n", + "hop_slice_below = fill_hops(hop_slice_below, seeds=seed_ids)\n", + "\n", + "print('Nodes when output_min < min_hops (labels on, seeds labeled):')\n", + "print(hop_slice_below._nodes.sort_values(['hop', g._node]))\n", + "print()\n", + "print('Edges when output_min < min_hops:')\n", + "print(hop_slice_below._edges.sort_values(['edge_hop', g._source, g._destination]))\n", + "display(render_svg(hop_slice_below, label_col='hop', title='Output min < min_hops'))" + ] + }, + { + "cell_type": "markdown", + "id": "2f742cce", + "metadata": {}, + "source": [ + "Output_max above traversal: slice is capped at traversal depth; edge labels show the cap.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c7c3a6c2", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T23:59:42.425115Z", + "iopub.status.busy": "2025-12-01T23:59:42.424761Z", + "iopub.status.idle": "2025-12-01T23:59:42.638704Z", + "shell.execute_reply": "2025-12-01T23:59:42.637327Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Edges when output_max > traversal max (still capped at traversal):\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " edge_hop s d\n", + "0 1 a b1\n", + "2 1 a b2\n", + "1 2 b1 c1\n", + "3 2 b2 c2\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "Output max > traversal\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "b1\n", + "\n", + "b2\n", + "\n", + "c1\n", + "\n", + "c2\n", + "\n", + "a\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Output slice max above traversal max (allowed, capped by traversal)\n", + "hop_slice_above = g.gfql([\n", + " n({g._node: is_in(seed_ids)}),\n", + " e_forward(min_hops=2, max_hops=2, output_max_hops=5, label_edge_hops='edge_hop'),\n", + "])\n", + "\n", + 
"print('Edges when output_max > traversal max (still capped at traversal):')\n", + "print(hop_slice_above._edges.sort_values(['edge_hop', g._source, g._destination]))\n", + "display(render_svg(hop_slice_above, label_col=None, title='Output max > traversal'))" + ] + }, + { + "cell_type": "markdown", + "id": "c6a70b78", + "metadata": {}, + "source": [ + "Invalid output slices: examples of raised validation errors for mismatched bounds.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7b742ce8", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T23:59:42.641332Z", + "iopub.status.busy": "2025-12-01T23:59:42.641055Z", + "iopub.status.idle": "2025-12-01T23:59:42.646480Z", + "shell.execute_reply": "2025-12-01T23:59:42.645183Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Invalid output_max < min_hops: [invalid-hops-value] output_max_hops cannot be below min_hops traversal bound | field: output_max_hops | value: 1 | suggestion: Raise output_max_hops or lower min_hops\n" + ] + } + ], + "source": [ + "# Invalid output slice (output_min > max_hops)\n", + "bad_output_min_chain = [\n", + " n({g._node: is_in(seed_ids)}),\n", + " e_forward(min_hops=2, max_hops=3, output_min_hops=5),\n", + "]\n", + "\n", + "# Invalid output slice (output_max < min_hops)\n", + "bad_output_max_chain = [\n", + " n({g._node: is_in(seed_ids)}),\n", + " e_forward(min_hops=2, max_hops=3, output_max_hops=1),\n", + "]\n", + "try:\n", + " g.gfql(bad_output_max_chain)\n", + "except Exception as e:\n", + " print('Invalid output_max < min_hops:', e)" + ] + }, + { + "cell_type": "markdown", + "id": "745cd0be", + "metadata": {}, + "source": [ + "## Labels\n", + "Compare hop labels with and without labeling the seeds.\n" + ] + }, + { + "cell_type": "markdown", + "id": "20369759", + "metadata": {}, + "source": [ + "Labels without seeds: hop numbers start at 1 for new nodes/edges; seed 'a' stays unlabeled.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4cb78f8b", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T23:59:42.649650Z", + "iopub.status.busy": "2025-12-01T23:59:42.649260Z", + "iopub.status.idle": "2025-12-01T23:59:42.891641Z", + "shell.execute_reply": "2025-12-01T23:59:42.890314Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nodes with hop labels (seeds not labeled):\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " id hop\n", + "1 b1 1\n", + "3 b2 1\n", + "2 c1 2\n", + "5 c2 2\n", + "4 d1 3\n", + "0 a \n", + "\n", + "Edges with hop labels (seeds not labeled):\n", + " edge_hop s d\n", + "0 1 a b1\n", + "3 1 a b2\n", + "1 2 b1 c1\n", + "4 2 b2 c2\n", + "2 3 c1 d1\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "Labels (no seeds)\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "a\n", + "\n", + "b1:1\n", + "\n", + "c1:2\n", + "\n", + "b2:1\n", + "\n", + "d1:3\n", + "\n", + "c2:2\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Hop labels without seed labels\n", + "hop_labels_off = g.gfql([\n", + " n({g._node: is_in(seed_ids)}),\n", + " e_forward(min_hops=1, max_hops=3, label_node_hops='hop', label_edge_hops='edge_hop', label_seeds=False),\n", + "])\n", + "hop_labels_off = fill_hops(hop_labels_off, seeds=seed_ids)\n", + "\n", + "print('Nodes with hop labels (seeds not labeled):')\n", + 
"print(hop_labels_off._nodes.sort_values(['hop', g._node]))\n", + "print()\n", + "print('Edges with hop labels (seeds not labeled):')\n", + "print(hop_labels_off._edges.sort_values(['edge_hop', g._source, g._destination]))\n", + "display(render_svg(hop_labels_off, label_col='hop', title='Labels (no seeds)'))" + ] + }, + { + "cell_type": "markdown", + "id": "9f72c319", + "metadata": {}, + "source": [ + "Labels with seeds: seed labeled hop 0; downstream hops increment from there.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "efbdff59", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T23:59:42.894439Z", + "iopub.status.busy": "2025-12-01T23:59:42.894083Z", + "iopub.status.idle": "2025-12-01T23:59:43.129411Z", + "shell.execute_reply": "2025-12-01T23:59:43.128136Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nodes with hop labels (seeds labeled hop=0):\n", + " id hop\n", + "0 a 0\n", + "1 b1 1\n", + "2 b2 1\n", + "3 c1 2\n", + "4 c2 2\n", + "5 d1 3\n", + "\n", + "Edges with hop labels (seeds labeled hop=0):\n", + " edge_hop s d\n", + "0 1 a b1\n", + "3 1 a b2\n", + "1 2 b1 c1\n", + "4 2 b2 c2\n", + "2 3 c1 d1\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "Labels (seeds=0)\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "a:0\n", + "\n", + "b1:1\n", + "\n", + "b2:1\n", + "\n", + "c1:2\n", + "\n", + "c2:2\n", + "\n", + "d1:3\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Hop labels with seed labels\n", + "hop_labels_on = g.gfql([\n", + " n({g._node: is_in(seed_ids)}),\n", + " e_forward(min_hops=1, max_hops=3, label_node_hops='hop', label_edge_hops='edge_hop', label_seeds=True),\n", + "])\n", + "hop_labels_on = fill_hops(hop_labels_on, seeds=seed_ids)\n", + "\n", + "print('Nodes with hop labels (seeds labeled hop=0):')\n", + "print(hop_labels_on._nodes.sort_values(['hop', g._node]))\n", + "print()\n", + "print('Edges with hop labels (seeds labeled hop=0):')\n", + "print(hop_labels_on._edges.sort_values(['edge_hop', g._source, g._destination]))\n", + "display(render_svg(hop_labels_on, label_col='hop', title='Labels (seeds=0)'))" + ] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/gfql/quick.rst b/docs/source/gfql/quick.rst index 325c079131..74880b0e88 100644 --- a/docs/source/gfql/quick.rst +++ b/docs/source/gfql/quick.rst @@ -61,12 +61,12 @@ Edge Matchers .. 
code-block:: python - e_forward(edge_match=None, hops=1, to_fixed_point=False, source_node_match=None, destination_node_match=None, source_node_query=None, destination_node_query=None, edge_query=None, name=None) - e_reverse(edge_match=None, hops=1, to_fixed_point=False, source_node_match=None, destination_node_match=None, source_node_query=None, destination_node_query=None, edge_query=None, name=None) - e_undirected(edge_match=None, hops=1, to_fixed_point=False, source_node_match=None, destination_node_match=None, source_node_query=None, destination_node_query=None, edge_query=None, name=None) + e_forward(edge_match=None, hops=1, min_hops=None, max_hops=None, output_min_hops=None, output_max_hops=None, label_node_hops=None, label_edge_hops=None, label_seeds=False, to_fixed_point=False, source_node_match=None, destination_node_match=None, source_node_query=None, destination_node_query=None, edge_query=None, name=None) + e_reverse(edge_match=None, hops=1, min_hops=None, max_hops=None, output_min_hops=None, output_max_hops=None, label_node_hops=None, label_edge_hops=None, label_seeds=False, to_fixed_point=False, source_node_match=None, destination_node_match=None, source_node_query=None, destination_node_query=None, edge_query=None, name=None) + e_undirected(edge_match=None, hops=1, min_hops=None, max_hops=None, output_min_hops=None, output_max_hops=None, label_node_hops=None, label_edge_hops=None, label_seeds=False, to_fixed_point=False, source_node_match=None, destination_node_match=None, source_node_query=None, destination_node_query=None, edge_query=None, name=None) # alias for e_undirected - e(edge_match=None, hops=1, to_fixed_point=False, source_node_match=None, destination_node_match=None, source_node_query=None, destination_node_query=None, edge_query=None, name=None) + e(edge_match=None, hops=1, min_hops=None, max_hops=None, output_min_hops=None, output_max_hops=None, label_node_hops=None, label_edge_hops=None, label_seeds=False, to_fixed_point=False, source_node_match=None, destination_node_match=None, source_node_query=None, destination_node_query=None, edge_query=None, name=None) :meth:`e ` matches edges based on their attributes (undirected). May also include matching on edge's source and destination nodes. @@ -77,6 +77,9 @@ Edge Matchers - `edge_match`: `{attribute: value}` or `{attribute: condition_function}` - `edge_query`: Custom query string for edge attributes. - `hops`: `int`, number of hops to traverse. + - `min_hops`/`max_hops`: Inclusive traversal bounds (`min_hops` defaults to 1, `max_hops` to `hops`). + - `output_min_hops`/`output_max_hops`: Optional post-filter slice; defaults keep all traversed hops up to `max_hops`. + - `label_node_hops`/`label_edge_hops`: Optional column names for hop numbers; `label_seeds=True` adds hop 0 for seeds. - `to_fixed_point`: `bool`, continue traversal until no more matches. - `source_node_match`: Filter for source nodes. - `destination_node_match`: Filter for destination nodes. @@ -92,6 +95,18 @@ Edge Matchers e_forward({"status": "active"}, hops=2) +- Traverse 2..4 hops but show only hops 3..4 with labels: + + .. code-block:: python + + e_forward( + {"status": "active"}, + min_hops=2, + max_hops=4, + output_min_hops=3, + label_edge_hops="edge_hop" + ) + - Use custom edge query strings: .. 
code-block:: python diff --git a/docs/source/gfql/spec/cypher_mapping.md b/docs/source/gfql/spec/cypher_mapping.md index 80e873537f..5f81572996 100644 --- a/docs/source/gfql/spec/cypher_mapping.md +++ b/docs/source/gfql/spec/cypher_mapping.md @@ -81,9 +81,11 @@ g.gfql([ | `-[r:KNOWS]->` | `e_forward({"type": "KNOWS"}, name="r")` | `{"type": "Edge", "direction": "forward", "edge_match": {"type": "KNOWS"}, "name": "r"}` | | `<-[r]-` | `e_reverse(name="r")` | `{"type": "Edge", "direction": "reverse", "name": "r"}` | | `-[r]-` | `e(name="r")` | `{"type": "Edge", "direction": "undirected", "name": "r"}` | -| `-[*2]->` | `e_forward(hops=2)` | `{"type": "Edge", "direction": "forward", "hops": 2}` | -| `-[*1..3]->` | `e_forward(hops=3)` | `{"type": "Edge", "direction": "forward", "hops": 3}` | # upper-bound only; lower bound = 1 | -| `-[*]->` | `e_forward(to_fixed_point=True)` | `{"type": "Edge", "direction": "forward", "to_fixed_point": true}` | +| `(n1)-[*2]->(n2)` | `e_forward(hops=2)` | `{"type": "Edge", "direction": "forward", "hops": 2}` | +| `(n1)-[*1..3]->(n2)` | `e_forward(min_hops=1, max_hops=3)` | `{"type": "Edge", "direction": "forward", "min_hops": 1, "max_hops": 3}` | +| `(n1)-[*3..3]->(n2)` | `e_forward(min_hops=3, max_hops=3)` | `{"type": "Edge", "direction": "forward", "min_hops": 3, "max_hops": 3}` | +| `(n1)-[*2..4]->(n2)` but only show hops 3..4 | `e_forward(min_hops=2, max_hops=4, output_min_hops=3, label_edge_hops="edge_hop")` | `{"type": "Edge", "direction": "forward", "min_hops": 2, "max_hops": 4, "output_min_hops": 3, "label_edge_hops": "edge_hop"}` | +| `(n1)-[*]->(n2)` | `e_forward(to_fixed_point=True)` | `{"type": "Edge", "direction": "forward", "to_fixed_point": true}` | | `-[r:BOUGHT {amount: gt(100)}]->` | `e_forward({"type": "BOUGHT", "amount": gt(100)}, name="r")` | `{"type": "Edge", "direction": "forward", "edge_match": {"type": "BOUGHT", "amount": {"type": "GT", "val": 100}}, "name": "r"}` | ### Predicates @@ -255,4 +257,4 @@ Rules: ## See Also - {ref}`gfql-spec-wire-protocol` - Full wire protocol specification - {ref}`gfql-spec-language` - Language specification -- {ref}`gfql-spec-python-embedding` - Python implementation details \ No newline at end of file +- {ref}`gfql-spec-python-embedding` - Python implementation details diff --git a/docs/source/gfql/spec/language.md b/docs/source/gfql/spec/language.md index 1817612b97..0adb21e56a 100644 --- a/docs/source/gfql/spec/language.md +++ b/docs/source/gfql/spec/language.md @@ -107,7 +107,10 @@ edge_params ::= edge_match_params ("," hop_params)? ("," node_filter_params)? (" filter_dict ::= "{" (property_filter ("," property_filter)*)? "}" property_filter ::= string ":" (value | predicate) -hop_params ::= "hops=" integer | "to_fixed_point=True" +hop_params ::= hop_bound_params | hop_slice_params | hop_label_params | "hops=" integer | "to_fixed_point=True" +hop_bound_params ::= "min_hops=" integer | "max_hops=" integer +hop_slice_params ::= "output_min_hops=" integer | "output_max_hops=" integer +hop_label_params ::= "label_node_hops=" string | "label_edge_hops=" string | "label_seeds=True" node_filter_params ::= source_filter ("," dest_filter)? source_filter ::= "source_node_match=" filter_dict | "source_node_query=" string dest_filter ::= "destination_node_match=" filter_dict | "destination_node_query=" string @@ -185,11 +188,14 @@ n(query="age > 30 and status == 'active'") # Query string Traverses edges in forward direction (source → destination). 
-**Syntax**: `e_forward(edge_match?, hops?, to_fixed_point?, source_node_match?, destination_node_match?, name?)` +**Syntax**: `e_forward(edge_match?, hops?, min_hops?, max_hops?, output_min_hops?, output_max_hops?, label_node_hops?, label_edge_hops?, label_seeds?, to_fixed_point?, source_node_match?, destination_node_match?, name?)` **Parameters**: - `edge_match`: Edge attribute filters -- `hops`: Number of hops (default: 1) +- `hops`: Number of hops (default: 1; shorthand for `max_hops`) +- `min_hops`/`max_hops`: Inclusive traversal bounds (default min=1 unless max=0; max defaults to hops) +- `output_min_hops`/`output_max_hops`: Optional post-filter slice; defaults keep all traversed hops up to `max_hops` +- `label_node_hops`/`label_edge_hops`: Optional hop-number columns; `label_seeds=True` writes hop 0 for seeds when labeling - `to_fixed_point`: Continue until no new nodes (default: False) - `source_node_match`: Filters for source nodes - `destination_node_match`: Filters for destination nodes @@ -199,6 +205,7 @@ Traverses edges in forward direction (source → destination). ```python e_forward() # One hop forward e_forward(hops=2) # Two hops forward +e_forward(min_hops=2, max_hops=4, output_min_hops=3, label_edge_hops="edge_hop") # bounded + sliced + labeled e_forward(to_fixed_point=True) # All reachable nodes e_forward({"type": "follows"}) # Only 'follows' edges e_forward(source_node_match={"active": True}) # From active nodes diff --git a/docs/source/gfql/spec/llm_guide.md b/docs/source/gfql/spec/llm_guide.md index 84a307385d..7b6e3921ab 100644 --- a/docs/source/gfql/spec/llm_guide.md +++ b/docs/source/gfql/spec/llm_guide.md @@ -39,7 +39,7 @@ ## Quick Example: Fraud Detection -**Dense:** `let({'suspicious': n({'risk_score': gt(80)}), 'flows': ref('suspicious', [e_forward(hops=3), n()]), 'ranked': ref('flows', [call('compute_cugraph', {'alg': 'pagerank'})]), 'viz': ref('ranked', [call('encode_point_color', {...}), call('encode_point_icon', {...})])})` +**Dense:** `let({'suspicious': n({'risk_score': gt(80)}), 'flows': ref('suspicious', [e_forward(min_hops=1, max_hops=3), n()]), 'ranked': ref('flows', [call('compute_cugraph', {'alg': 'pagerank'})]), 'viz': ref('ranked', [call('encode_point_color', {...}), call('encode_point_icon', {...})])})` **JSON:** ```json @@ -54,7 +54,7 @@ "type": "ChainRef", "ref": "suspicious", "chain": [ - {"type": "Edge", "direction": "forward", "hops": 3, "to_fixed_point": false, + {"type": "Edge", "direction": "forward", "min_hops": 1, "max_hops": 3, "to_fixed_point": false, "edge_match": {"amount": {"type": "GT", "val": 10000}}}, {"type": "Node", "filter_dict": {}} ] @@ -101,7 +101,13 @@ { "type": "Edge", "direction": "forward|reverse|undirected", // required - "hops": 1, // default: 1 + "max_hops": 1, // default: 1 (hops shorthand) + "min_hops": 1, // optional; default 1 unless max_hops is 0 + "output_min_hops": 1, // optional post-filter slice; defaults keep 1..max_hops + "output_max_hops": 1, // optional post-filter cap; defaults to max_hops + "label_node_hops": "hop", // optional; omit/null to skip node hop labels + "label_edge_hops": "edge_hop", // optional; omit/null to skip edge hop labels + "label_seeds": false, // optional; when true, label seeds at hop 0 "to_fixed_point": false, // default: false "edge_match": {filters}, // optional "source_node_match": {filters}, // optional @@ -180,14 +186,14 @@ **Multi-hop (friends of friends):** ```python -# Dense: [n({'name': 'Alice'}), e_forward(hops=2), n()] +# Dense: [n({'name': 'Alice'}), 
e_forward(min_hops=1, max_hops=2), n()] ``` ```json { "type": "Chain", "chain": [ {"type": "Node", "filter_dict": {"name": "Alice"}}, - {"type": "Edge", "direction": "forward", "hops": 2, "to_fixed_point": false}, + {"type": "Edge", "direction": "forward", "min_hops": 1, "max_hops": 2, "to_fixed_point": false}, {"type": "Node", "filter_dict": {}} ] } @@ -330,9 +336,9 @@ ### Traversals & Filters -**Hop (Multi-step):** `call('hop', {'hops': 3, 'direction': 'forward'})` +**Hop (Multi-step):** `call('hop', {'min_hops': 1, 'max_hops': 3, 'direction': 'forward'})` ```json -{"type": "Call", "function": "hop", "params": {"hops": 3, "direction": "forward"}} +{"type": "Call", "function": "hop", "params": {"min_hops": 1, "max_hops": 3, "direction": "forward"}} ``` **Filter Nodes:** `call('filter_nodes_by_dict', {'query': {'type': 'Person', 'age': {'type': 'GT', 'val': 30}}})` @@ -537,8 +543,9 @@ See [Quick Example](#quick-example-fraud-detection) for full JSON example. 1. **Always include `type` field** in every object 2. **Chain wraps operations** - use `{"type": "Chain", "chain": [...]}` -3. **Edge defaults:** `direction: "forward"`, `hops: 1`, `to_fixed_point: false` -4. **Empty filters:** Use `{}` for match-all +3. **Edge defaults:** `direction: "forward"`, `max_hops: 1` (`hops` shorthand), `min_hops: 1` unless `max_hops` is 0, `to_fixed_point: false` +4. **Output slice defaults:** If `output_min_hops`/`output_max_hops` are omitted, results keep all traversed hops up to `max_hops`; set them to post-filter displayed hops. +5. **Empty filters:** Use `{}` for match-all -5. **Predicates:** Wrap comparisons: `{"type": "GT", "val": 100}` +6. **Predicates:** Wrap comparisons: `{"type": "GT", "val": 100}` -6. **Temporal:** Tag values: `{"type": "datetime", "value": "...", "timezone": "UTC"}` +7. **Temporal:** Tag values: `{"type": "datetime", "value": "...", "timezone": "UTC"}` -7. **ChainRef:** Reference bindings: `{"type": "ChainRef", "ref": "name", "chain": [...]}` +8. **ChainRef:** Reference bindings: `{"type": "ChainRef", "ref": "name", "chain": [...]}` @@ -553,7 +560,7 @@ See [Quick Example](#quick-example-fraud-detection) for full JSON example. 
@@ -553,7 +560,7 @@ See [Quick Example](#quick-example-fraud-detection) for full JSON example.

 **Wrong:** Raw datetime: `{"timestamp": "2024-01-01"}`
 **Correct:** `{"timestamp": {"type": "GT", "val": {"type": "datetime", "value": "2024-01-01T00:00:00"}}}`

-**Wrong:** Forgot to_fixed_point: `{"hops": 999}` for "traverse all"
+**Wrong:** Forgot to_fixed_point: `{"max_hops": 999}` for "traverse all"
 **Correct:** `{"to_fixed_point": true}`

 **Wrong:** Using `"backward"` instead of `"reverse"`
diff --git a/docs/source/gfql/spec/wire_protocol.md b/docs/source/gfql/spec/wire_protocol.md
index e39f64bafe..9693a678e2 100644
--- a/docs/source/gfql/spec/wire_protocol.md
+++ b/docs/source/gfql/spec/wire_protocol.md
@@ -84,7 +84,10 @@ n({"type": "person", "age": gt(30)}, name="adults")
 ```python
 e_forward(
     {"type": "transaction"},
-    hops=2,
+    min_hops=2,
+    max_hops=4,
+    output_min_hops=3,
+    label_edge_hops="edge_hop",
     source_node_match={"active": True},
     name="txns"
 )
@@ -95,17 +98,23 @@ e_forward(
 {
   "type": "Edge",
   "direction": "forward",
-  "edge_match": {
-    "type": "transaction"
-  },
-  "hops": 2,
-  "source_node_match": {
-    "active": true
-  },
+  "edge_match": { "type": "transaction" },
+  "min_hops": 2,
+  "max_hops": 4,
+  "output_min_hops": 3,
+  "label_edge_hops": "edge_hop",
+  "source_node_match": { "active": true },
   "name": "txns"
 }
 ```

+Optional fields:
+- `hops` (shorthand for `max_hops`)
+- `output_min_hops`
+- `output_max_hops`
+- `label_node_hops`, `label_edge_hops`, `label_seeds`
+- `to_fixed_point`
+
 ### Chain

 **Python**:
diff --git a/docs/source/gfql/translate.rst b/docs/source/gfql/translate.rst
index 9baf34bce4..0014c70b98 100644
--- a/docs/source/gfql/translate.rst
+++ b/docs/source/gfql/translate.rst
@@ -171,6 +171,22 @@ Performing Multi-Hop Traversals
     n({g._node: "Alice"}), e_forward(), e_forward(), n(name='m')
     ])._nodes.query('m')

+**GFQL (bounded hop alternative)**
+
+.. code-block:: python
+
+    # Same intent using hop() with explicit bounds + optional labels
+    g.hop(
+        nodes=pd.DataFrame({g._node: ['Alice']}),
+        min_hops=2,
+        max_hops=2
+    )
+
+**Explanation (bounded hops)**:
+
+- ``min_hops``/``max_hops`` match Cypher's variable-length pattern (``[*2..2]``) while staying dataframe-native. Setting ``label_node_hops``/``label_edge_hops`` stores the hop step under those column names (nodes record first arrival, edges record the traversal step); omit them or pass ``None`` to skip labels.
+- ``output_min_hops``/``output_max_hops`` (optional) slice the displayed hops after traversal. By default, all traversed hops up to ``max_hops`` remain visible; set ``output_min_hops`` to drop early hops (e.g., traverse 2..4 but show only 3..4). Invalid slices (e.g., ``output_min_hops`` > ``max_hops`` or ``output_max_hops`` < ``min_hops``) raise a ``ValueError``.
+
 **Explanation**:

 - **GFQL**: Starts at node `"Alice"`, performs two forward hops, and obtains nodes two steps away. Results are in `nodes_df`. Building on the expressive and performance benefits of the previous 1-hop example, it begins adding the parallel path finding benefits of GFQL over Cypher, which benefits both CPU and GPU usage.
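Rounding out the docs above, here is an end-to-end sketch of the new `hop()` parameters on a toy pandas chain graph `a -> b -> c -> d -> e`. It is not part of the patch; the bindings (`n`, `s`, `d`) and label column names are arbitrary demo choices:

```python
# Hedged sketch: bounded traversal + output slicing + hop labels on a chain graph.
import pandas as pd
import graphistry

nodes = pd.DataFrame({'n': list('abcde')})
edges = pd.DataFrame({'s': list('abcd'), 'd': list('bcde')})
g = graphistry.nodes(nodes, 'n').edges(edges, 's', 'd')

# Traverse 2..4 hops out of 'a', but post-filter the output to hops 3..4,
# recording hop numbers on nodes/edges (and hop 0 on the seed via label_seeds)
g2 = g.hop(
    nodes=pd.DataFrame({'n': ['a']}),
    min_hops=2,
    max_hops=4,
    output_min_hops=3,
    label_node_hops='node_hop',
    label_edge_hops='edge_hop',
    label_seeds=True,
)
print(g2._nodes)  # expect d/e labeled 3/4 (seed and edge endpoints may also remain)
print(g2._edges)  # expect the hop-3 and hop-4 edges, labeled in 'edge_hop'
```

Per the `graphistry/compute/ast.py` changes below, `e_forward(...)` inside a chain forwards these same parameters to `hop()`, so chained and direct traversals share one slicing semantics.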
diff --git a/docs/source/notebooks/gfql.rst b/docs/source/notebooks/gfql.rst index a967eae076..8ea093428c 100644 --- a/docs/source/notebooks/gfql.rst +++ b/docs/source/notebooks/gfql.rst @@ -7,9 +7,9 @@ GFQL Graph queries :titlesonly: Intro to graph queries with hop and chain <../demos/more_examples/graphistry_features/hop_and_chain_graph_pattern_mining.ipynb> + Hop ranges, slices, and labels <../gfql/hop_bounds.ipynb> GFQL Validation Fundamentals <../demos/gfql/gfql_validation_fundamentals.ipynb> DateTime Filtering Examples <../demos/gfql/temporal_predicates.ipynb> GPU Benchmarking <../demos/gfql/benchmark_hops_cpu_gpu.ipynb> GFQL Remote mode <../demos/gfql/gfql_remote.ipynb> Python Remote mode <../demos/gfql/python_remote.ipynb> - diff --git a/graphistry/Plottable.py b/graphistry/Plottable.py index 627cd73af2..91eacc2044 100644 --- a/graphistry/Plottable.py +++ b/graphistry/Plottable.py @@ -409,6 +409,14 @@ def collapse( def hop(self, nodes: Optional[pd.DataFrame], hops: Optional[int] = 1, + *, + min_hops: Optional[int] = None, + max_hops: Optional[int] = None, + output_min_hops: Optional[int] = None, + output_max_hops: Optional[int] = None, + label_node_hops: Optional[str] = None, + label_edge_hops: Optional[str] = None, + label_seeds: bool = False, to_fixed_point: bool = False, direction: str = 'forward', edge_match: Optional[dict] = None, diff --git a/graphistry/compute/ast.py b/graphistry/compute/ast.py index 57d734fd2b..df912fe410 100644 --- a/graphistry/compute/ast.py +++ b/graphistry/compute/ast.py @@ -272,6 +272,13 @@ def __init__( direction: Optional[Direction] = DEFAULT_DIRECTION, edge_match: Optional[dict] = DEFAULT_FILTER_DICT, hops: Optional[int] = DEFAULT_HOPS, + min_hops: Optional[int] = None, + max_hops: Optional[int] = None, + output_min_hops: Optional[int] = None, + output_max_hops: Optional[int] = None, + label_node_hops: Optional[str] = None, + label_edge_hops: Optional[str] = None, + label_seeds: bool = False, to_fixed_point: bool = DEFAULT_FIXED_POINT, source_node_match: Optional[dict] = DEFAULT_FILTER_DICT, destination_node_match: Optional[dict] = DEFAULT_FILTER_DICT, @@ -293,6 +300,13 @@ def __init__( destination_node_match = None self.hops = hops + self.min_hops = min_hops + self.max_hops = max_hops + self.output_min_hops = output_min_hops + self.output_max_hops = output_max_hops + self.label_node_hops = label_node_hops + self.label_edge_hops = label_edge_hops + self.label_seeds = label_seeds self.to_fixed_point = to_fixed_point self.direction : Direction = direction self.source_node_match = source_node_match @@ -303,7 +317,7 @@ def __init__( self.edge_query = edge_query def __repr__(self) -> str: - return f'ASTEdge(direction={self.direction}, edge_match={self.edge_match}, hops={self.hops}, to_fixed_point={self.to_fixed_point}, source_node_match={self.source_node_match}, destination_node_match={self.destination_node_match}, name={self._name}, source_node_query={self.source_node_query}, destination_node_query={self.destination_node_query}, edge_query={self.edge_query})' + return f'ASTEdge(direction={self.direction}, edge_match={self.edge_match}, hops={self.hops}, min_hops={self.min_hops}, max_hops={self.max_hops}, output_min_hops={self.output_min_hops}, output_max_hops={self.output_max_hops}, label_node_hops={self.label_node_hops}, label_edge_hops={self.label_edge_hops}, label_seeds={self.label_seeds}, to_fixed_point={self.to_fixed_point}, source_node_match={self.source_node_match}, destination_node_match={self.destination_node_match}, name={self._name}, 
source_node_query={self.source_node_query}, destination_node_query={self.destination_node_query}, edge_query={self.edge_query})' def _validate_fields(self) -> None: """Validate edge fields.""" @@ -318,6 +332,77 @@ def _validate_fields(self) -> None: suggestion="Use hops=2 for specific count, or to_fixed_point=True for unbounded", ) + # Validate hop bounds/slices + for field_name, field_val in [ + ("min_hops", self.min_hops), + ("max_hops", self.max_hops), + ("output_min_hops", self.output_min_hops), + ("output_max_hops", self.output_max_hops), + ]: + if field_val is not None and (not isinstance(field_val, int) or field_val < 0): + raise GFQLTypeError( + ErrorCode.E103, + f"{field_name} must be a non-negative integer or None", + field=field_name, + value=field_val, + ) + + if self.min_hops is not None and self.max_hops is not None and self.min_hops > self.max_hops: + raise GFQLTypeError( + ErrorCode.E103, + "min_hops cannot exceed max_hops", + field="min_hops", + value=self.min_hops, + suggestion="Set min_hops <= max_hops", + ) + + if self.output_min_hops is not None and self.output_max_hops is not None and self.output_min_hops > self.output_max_hops: + raise GFQLTypeError( + ErrorCode.E103, + "output_min_hops cannot exceed output_max_hops", + field="output_min_hops", + value=self.output_min_hops, + suggestion="Set output_min_hops <= output_max_hops", + ) + + if self.output_min_hops is not None and self.max_hops is not None and self.output_min_hops > self.max_hops: + raise GFQLTypeError( + ErrorCode.E103, + "output_min_hops cannot exceed max_hops traversal bound", + field="output_min_hops", + value=self.output_min_hops, + suggestion="Lower output_min_hops or raise max_hops", + ) + + if self.output_max_hops is not None and self.min_hops is not None and self.output_max_hops < self.min_hops: + raise GFQLTypeError( + ErrorCode.E103, + "output_max_hops cannot be below min_hops traversal bound", + field="output_max_hops", + value=self.output_max_hops, + suggestion="Raise output_max_hops or lower min_hops", + ) + + for label_field, label_val in [ + ("label_node_hops", self.label_node_hops), + ("label_edge_hops", self.label_edge_hops), + ]: + if label_val is not None and not isinstance(label_val, str): + raise GFQLTypeError( + ErrorCode.E204, + f"{label_field} must be a string when provided", + field=label_field, + value=type(label_val).__name__, + ) + + if not isinstance(self.label_seeds, bool): + raise GFQLTypeError( + ErrorCode.E201, + "label_seeds must be a boolean", + field="label_seeds", + value=type(self.label_seeds).__name__, + ) + # Validate to_fixed_point if not isinstance(self.to_fixed_point, bool): raise GFQLTypeError( @@ -402,6 +487,13 @@ def to_json(self, validate=True) -> dict: return { 'type': 'Edge', 'hops': self.hops, + **({'min_hops': self.min_hops} if self.min_hops is not None else {}), + **({'max_hops': self.max_hops} if self.max_hops is not None else {}), + **({'output_min_hops': self.output_min_hops} if self.output_min_hops is not None else {}), + **({'output_max_hops': self.output_max_hops} if self.output_max_hops is not None else {}), + **({'label_node_hops': self.label_node_hops} if self.label_node_hops is not None else {}), + **({'label_edge_hops': self.label_edge_hops} if self.label_edge_hops is not None else {}), + **({'label_seeds': self.label_seeds} if self.label_seeds else {}), 'to_fixed_point': self.to_fixed_point, 'direction': self.direction, **({'source_node_match': { @@ -431,6 +523,13 @@ def from_json(cls, d: dict, validate: bool = True) -> 'ASTEdge': 
direction=d['direction'] if 'direction' in d else None, edge_match=maybe_filter_dict_from_json(d, 'edge_match'), hops=d['hops'] if 'hops' in d else None, + min_hops=d.get('min_hops'), + max_hops=d.get('max_hops'), + output_min_hops=d.get('output_min_hops'), + output_max_hops=d.get('output_max_hops'), + label_node_hops=d.get('label_node_hops'), + label_edge_hops=d.get('label_edge_hops'), + label_seeds=d.get('label_seeds', False), to_fixed_point=d['to_fixed_point'] if 'to_fixed_point' in d else DEFAULT_FIXED_POINT, source_node_match=maybe_filter_dict_from_json(d, 'source_node_match'), destination_node_match=maybe_filter_dict_from_json(d, 'destination_node_match'), @@ -460,9 +559,33 @@ def __call__( logger.debug('g._edges:\n%s\n', g._edges) logger.debug('----------------------------------------') + wants_output_slice = self.output_min_hops is not None or self.output_max_hops is not None + return_wavefront = True # AST edges are used in chain/gfql wavefront mode + # Avoid slicing during traversal but keep hop labels so the final combine step can filter. + resolved_output_min = None if return_wavefront else self.output_min_hops + resolved_output_max = None if return_wavefront else self.output_max_hops + # Use declared min_hops for traversal; hop.py handles path pruning for min_hops > 1 + resolved_min_hops = self.min_hops + resolved_max_hops = self.max_hops + + label_node_hops = self.label_node_hops + label_edge_hops = self.label_edge_hops + needs_auto_labels = wants_output_slice or (self.min_hops is not None and self.min_hops > 0) + if return_wavefront and needs_auto_labels: + # Ensure hop labels exist for post-filtering even if user didn't request explicit labels + label_node_hops = label_node_hops or '__gfql_output_node_hop__' + label_edge_hops = label_edge_hops or '__gfql_output_edge_hop__' + out_g = g.hop( nodes=prev_node_wavefront, hops=self.hops, + min_hops=resolved_min_hops, + max_hops=resolved_max_hops, + output_min_hops=resolved_output_min, + output_max_hops=resolved_output_max, + label_node_hops=label_node_hops, + label_edge_hops=label_edge_hops, + label_seeds=self.label_seeds, to_fixed_point=self.to_fixed_point, direction=self.direction, source_node_match=self.source_node_match, @@ -493,10 +616,20 @@ def reverse(self) -> 'ASTEdge': direction = 'reverse' else: direction = 'undirected' + # The reverse pass validates path completeness, not hop constraints. + # Forward pass already pruned dead-end branches; reverse just needs to traverse + # the remaining edges back to seeds. Use min_hops=None to skip re-pruning. 
return ASTEdge( direction=direction, edge_match=self.edge_match, hops=self.hops, + min_hops=None, + max_hops=self.max_hops, + output_min_hops=None, + output_max_hops=None, + label_node_hops=self.label_node_hops, + label_edge_hops=self.label_edge_hops, + label_seeds=self.label_seeds, to_fixed_point=self.to_fixed_point, source_node_match=self.destination_node_match, destination_node_match=self.source_node_match, @@ -514,6 +647,13 @@ def __init__( self, edge_match: Optional[dict] = DEFAULT_FILTER_DICT, hops: Optional[int] = DEFAULT_HOPS, + min_hops: Optional[int] = None, + max_hops: Optional[int] = None, + output_min_hops: Optional[int] = None, + output_max_hops: Optional[int] = None, + label_node_hops: Optional[str] = None, + label_edge_hops: Optional[str] = None, + label_seeds: bool = False, source_node_match: Optional[dict] = DEFAULT_FILTER_DICT, destination_node_match: Optional[dict] = DEFAULT_FILTER_DICT, to_fixed_point: bool = DEFAULT_FIXED_POINT, @@ -526,6 +666,13 @@ def __init__( direction='forward', edge_match=edge_match, hops=hops, + min_hops=min_hops, + max_hops=max_hops, + output_min_hops=output_min_hops, + output_max_hops=output_max_hops, + label_node_hops=label_node_hops, + label_edge_hops=label_edge_hops, + label_seeds=label_seeds, source_node_match=source_node_match, destination_node_match=destination_node_match, to_fixed_point=to_fixed_point, @@ -540,6 +687,13 @@ def from_json(cls, d: dict, validate: bool = True) -> 'ASTEdge': out = ASTEdgeForward( edge_match=maybe_filter_dict_from_json(d, 'edge_match'), hops=d['hops'] if 'hops' in d else None, + min_hops=d.get('min_hops'), + max_hops=d.get('max_hops'), + output_min_hops=d.get('output_min_hops'), + output_max_hops=d.get('output_max_hops'), + label_node_hops=d.get('label_node_hops'), + label_edge_hops=d.get('label_edge_hops'), + label_seeds=d.get('label_seeds', False), to_fixed_point=d['to_fixed_point'] if 'to_fixed_point' in d else DEFAULT_FIXED_POINT, source_node_match=maybe_filter_dict_from_json(d, 'source_node_match'), destination_node_match=maybe_filter_dict_from_json(d, 'destination_node_match'), @@ -564,6 +718,13 @@ def __init__( self, edge_match: Optional[dict] = DEFAULT_FILTER_DICT, hops: Optional[int] = DEFAULT_HOPS, + min_hops: Optional[int] = None, + max_hops: Optional[int] = None, + output_min_hops: Optional[int] = None, + output_max_hops: Optional[int] = None, + label_node_hops: Optional[str] = None, + label_edge_hops: Optional[str] = None, + label_seeds: bool = False, source_node_match: Optional[dict] = DEFAULT_FILTER_DICT, destination_node_match: Optional[dict] = DEFAULT_FILTER_DICT, to_fixed_point: bool = DEFAULT_FIXED_POINT, @@ -576,6 +737,13 @@ def __init__( direction='reverse', edge_match=edge_match, hops=hops, + min_hops=min_hops, + max_hops=max_hops, + output_min_hops=output_min_hops, + output_max_hops=output_max_hops, + label_node_hops=label_node_hops, + label_edge_hops=label_edge_hops, + label_seeds=label_seeds, source_node_match=source_node_match, destination_node_match=destination_node_match, to_fixed_point=to_fixed_point, @@ -590,6 +758,13 @@ def from_json(cls, d: dict, validate: bool = True) -> 'ASTEdge': out = ASTEdgeReverse( edge_match=maybe_filter_dict_from_json(d, 'edge_match'), hops=d['hops'] if 'hops' in d else None, + min_hops=d.get('min_hops'), + max_hops=d.get('max_hops'), + output_min_hops=d.get('output_min_hops'), + output_max_hops=d.get('output_max_hops'), + label_node_hops=d.get('label_node_hops'), + label_edge_hops=d.get('label_edge_hops'), + label_seeds=d.get('label_seeds', False), 
to_fixed_point=d['to_fixed_point'] if 'to_fixed_point' in d else DEFAULT_FIXED_POINT, source_node_match=maybe_filter_dict_from_json(d, 'source_node_match'), destination_node_match=maybe_filter_dict_from_json(d, 'destination_node_match'), @@ -614,6 +789,13 @@ def __init__( self, edge_match: Optional[dict] = DEFAULT_FILTER_DICT, hops: Optional[int] = DEFAULT_HOPS, + min_hops: Optional[int] = None, + max_hops: Optional[int] = None, + output_min_hops: Optional[int] = None, + output_max_hops: Optional[int] = None, + label_node_hops: Optional[str] = None, + label_edge_hops: Optional[str] = None, + label_seeds: bool = False, source_node_match: Optional[dict] = DEFAULT_FILTER_DICT, destination_node_match: Optional[dict] = DEFAULT_FILTER_DICT, to_fixed_point: bool = DEFAULT_FIXED_POINT, @@ -626,6 +808,13 @@ def __init__( direction='undirected', edge_match=edge_match, hops=hops, + min_hops=min_hops, + max_hops=max_hops, + output_min_hops=output_min_hops, + output_max_hops=output_max_hops, + label_node_hops=label_node_hops, + label_edge_hops=label_edge_hops, + label_seeds=label_seeds, source_node_match=source_node_match, destination_node_match=destination_node_match, to_fixed_point=to_fixed_point, @@ -640,6 +829,13 @@ def from_json(cls, d: dict, validate: bool = True) -> 'ASTEdge': out = ASTEdgeUndirected( edge_match=maybe_filter_dict_from_json(d, 'edge_match'), hops=d['hops'] if 'hops' in d else None, + min_hops=d.get('min_hops'), + max_hops=d.get('max_hops'), + output_min_hops=d.get('output_min_hops'), + output_max_hops=d.get('output_max_hops'), + label_node_hops=d.get('label_node_hops'), + label_edge_hops=d.get('label_edge_hops'), + label_seeds=d.get('label_seeds', False), to_fixed_point=d['to_fixed_point'] if 'to_fixed_point' in d else DEFAULT_FIXED_POINT, source_node_match=maybe_filter_dict_from_json(d, 'source_node_match'), destination_node_match=maybe_filter_dict_from_json(d, 'destination_node_match'), diff --git a/graphistry/compute/chain.py b/graphistry/compute/chain.py index ba29e9c0cf..7a11c4edc3 100644 --- a/graphistry/compute/chain.py +++ b/graphistry/compute/chain.py @@ -1,5 +1,6 @@ import logging -from typing import Dict, Union, cast, List, Tuple, Optional, TYPE_CHECKING, Callable, Any +import pandas as pd +from typing import Dict, Union, cast, List, Tuple, Optional, TYPE_CHECKING from graphistry.Engine import Engine, EngineAbstract, df_concat, df_to_engine, resolve_engine from graphistry.Plottable import Plottable @@ -153,7 +154,13 @@ def validate_schema(self, g: Plottable, collect_all: bool = False) -> Optional[L ############################################################################### -def combine_steps(g: Plottable, kind: str, steps: List[Tuple[ASTObject,Plottable]], engine: Engine) -> DataFrameT: +def combine_steps( + g: Plottable, + kind: str, + steps: List[Tuple[ASTObject, Plottable]], + engine: Engine, + label_steps: Optional[List[Tuple[ASTObject, Plottable]]] = None +) -> DataFrameT: """ Collect nodes and edges, taking care to deduplicate and tag any names """ @@ -168,27 +175,81 @@ def combine_steps(g: Plottable, kind: str, steps: List[Tuple[ASTObject,Plottable logger.debug('combine_steps ops pre: %s', [op for (op, _) in steps]) if kind == 'edges': logger.debug('EDGES << recompute forwards given reduced set') - steps = [ - ( + node_id = getattr(g, '_node') + full_nodes = getattr(g, '_nodes', None) + + # For edges, we need to re-run forward ops but use the PREVIOUS forward step's nodes + # as prev_node_wavefront (not the current reverse step's nodes which include + # 
nodes reached during reverse traversal). + new_steps = [] + for idx, (op, g_step) in enumerate(steps): + # Get prev_node_wavefront from the previous forward step (label_steps), not reverse result + if label_steps is not None and idx > 0: + prev_fwd_step = label_steps[idx - 1][1] + prev_wavefront_source = prev_fwd_step._nodes + else: + prev_wavefront_source = g_step._nodes + + prev_node_wavefront = ( + safe_merge( + full_nodes, + prev_wavefront_source[[node_id]], + on=node_id, + how='inner', + engine=engine, + ) if full_nodes is not None and node_id is not None and prev_wavefront_source is not None else prev_wavefront_source + ) + + new_steps.append(( op, # forward op op( g=g.edges(g_step._edges), # transition via any found edge - prev_node_wavefront=g_step._nodes, # start from where backwards step says is reachable - - #target_wave_front=steps[i+1][1]._nodes # end at where next backwards step says is reachable - target_wave_front=None, # ^^^ optimization: valid transitions already limit to known-good ones + prev_node_wavefront=prev_node_wavefront, + target_wave_front=None, engine=engine ) - ) - for (op, g_step) in steps - ] + )) + steps = new_steps logger.debug('-----------[ combine %s ---------------]', kind) # df[[id]] - with defensive checks for column existence + if label_steps is None: + label_steps = steps + + def apply_output_slice(op: ASTObject, op_label: ASTObject, df): + if not isinstance(op_label, ASTEdge): + return df + out_min = getattr(op, 'output_min_hops', None) + out_max = getattr(op, 'output_max_hops', None) + # Fall back to forward op (with labels) when reverse op drops slice info + if out_min is None and out_max is None: + out_min = getattr(op_label, 'output_min_hops', None) + out_max = getattr(op_label, 'output_max_hops', None) + if out_min is None and out_max is None: + return df + label_col = op_label.label_node_hops if kind == 'nodes' else op_label.label_edge_hops + if label_col is None: + # best-effort fallback to any hop-like column + hop_like = [c for c in df.columns if 'hop' in c] + if not hop_like: + return df + label_col = hop_like[0] + if label_col not in df.columns: + return df + mask = df[label_col].notna() + if out_min is not None: + mask = mask & (df[label_col] >= out_min) + if out_max is not None: + mask = mask & (df[label_col] <= out_max) + return df[mask] + dfs_to_concat = [] - for (op, g_step) in steps: - step_df = getattr(g_step, df_fld) + extra_step_dfs = [] + base_cols = set(getattr(g, df_fld).columns) + for idx, (op, g_step) in enumerate(steps): + op_label = label_steps[idx][0] if idx < len(label_steps) else op + step_df = apply_output_slice(op, op_label, getattr(g_step, df_fld)) if id not in step_df.columns: step_id = getattr(g_step, '_node' if kind == 'nodes' else '_edge') raise ValueError(f"Column '{id}' not found in {kind} step DataFrame. 
" @@ -196,6 +257,14 @@ def combine_steps(g: Plottable, kind: str, steps: List[Tuple[ASTObject,Plottable f"Operation: {op}") dfs_to_concat.append(step_df[[id]]) + for _, (_, g_step) in enumerate(label_steps): + step_df = getattr(g_step, df_fld) + if id not in step_df.columns: + continue + extra_cols = [c for c in step_df.columns if c != id and c not in base_cols and 'hop' in c] + if extra_cols: + extra_step_dfs.append(step_df[[id] + extra_cols]) + # Honor user's engine request by converting DataFrames to match requested engine # This ensures API contract: engine parameter guarantees output DataFrame type if len(dfs_to_concat) > 0: @@ -206,6 +275,46 @@ def combine_steps(g: Plottable, kind: str, steps: List[Tuple[ASTObject,Plottable concat = df_concat(engine) out_df = concat(dfs_to_concat).drop_duplicates(subset=[id]) + + # Merge through any additional columns produced by steps (e.g., hop labels) + label_cols = set() + for step_df in extra_step_dfs: + if len(step_df.columns) <= 1: # only id column + continue + label_cols.update([c for c in step_df.columns if c != id]) + out_df = safe_merge(out_df, step_df, on=id, how='left', engine=engine) + for col in step_df.columns: + if col == id: + continue + col_x, col_y = f'{col}_x', f'{col}_y' + if col_x in out_df.columns and col_y in out_df.columns: + out_df[col] = out_df[col_x].fillna(out_df[col_y]) + out_df = out_df.drop(columns=[col_x, col_y]) + + # Final post-filter: apply output slice to the combined result + for idx, (op, _) in enumerate(steps): + op_label = label_steps[idx][0] if idx < len(label_steps) else op + if isinstance(op, ASTEdge): + out_df = apply_output_slice(op, op_label, out_df) + + # If hop labels requested and seeds should be labeled, add hop 0 for seeds missing labels + if kind == 'nodes' and label_cols: + label_seeds_requested = any(isinstance(op, ASTEdge) and getattr(op, 'label_seeds', False) for op, _ in label_steps) + if label_seeds_requested and label_steps: + seed_df = getattr(label_steps[0][1], df_fld) + if seed_df is not None and id in seed_df.columns: + seed_ids = seed_df[[id]].drop_duplicates() + # align engines defensively + if resolve_engine(EngineAbstract.AUTO, seed_ids) != resolve_engine(EngineAbstract.AUTO, out_df): + seed_ids = df_to_engine(seed_ids, resolve_engine(EngineAbstract.AUTO, out_df)) + try: + seed_mask = out_df[id].isin(seed_ids[id]) + except Exception: + seed_mask = None + if seed_mask is not None: + for col in label_cols: + if col in out_df.columns: + out_df.loc[seed_mask, col] = out_df.loc[seed_mask, col].fillna(0) if logger.isEnabledFor(logging.DEBUG): for (op, g_step) in steps: if kind == 'edges': @@ -215,14 +324,42 @@ def combine_steps(g: Plottable, kind: str, steps: List[Tuple[ASTObject,Plottable # df[[id, op_name1, ...]] logger.debug('combine_steps ops: %s', [op for (op, _) in steps]) - for (op, g_step) in steps: + for idx, (op, g_step) in enumerate(steps): if op._name is not None and isinstance(op, op_type): logger.debug('tagging kind [%s] name %s', op_type, op._name) step_df = getattr(g_step, df_fld)[[id, op._name]] # Use safe_merge to handle engine type coercion automatically out_df = safe_merge(out_df, step_df, on=id, how='left', engine=engine) - s = out_df[op._name] - out_df[op._name] = s.where(s.notna(), False).astype('bool') + # Collapse any merge suffixes introduced by repeated tags + x_name, y_name = f'{op._name}_x', f'{op._name}_y' + if x_name in out_df.columns and y_name in out_df.columns: + out_df[op._name] = out_df[x_name].fillna(out_df[y_name]) + out_df = 
out_df.drop(columns=[x_name, y_name]) + out_df[op._name] = out_df[op._name].fillna(False).astype('bool') + + # Restrict node aliases to endpoints that actually fed the next edge step + if kind == 'nodes' and idx + 1 < len(steps): + next_op, next_step = steps[idx + 1] + if isinstance(next_op, ASTEdge): + allowed_ids = None + try: + if next_op.direction == 'forward': + allowed_ids = next_step._edges[next_step._source] + elif next_op.direction == 'reverse': + allowed_ids = next_step._edges[next_step._destination] + else: # undirected + allowed_ids = pd.concat( + [ + next_step._edges[next_step._source], + next_step._edges[next_step._destination], + ], + ignore_index=True, + ) + except Exception: + allowed_ids = None + + if allowed_ids is not None and id in out_df.columns: + out_df[op._name] = out_df[op._name] & out_df[id].isin(allowed_ids) # Use safe_merge for final merge with automatic engine type coercion g_df = getattr(g, df_fld) @@ -230,6 +367,62 @@ def combine_steps(g: Plottable, kind: str, steps: List[Tuple[ASTObject,Plottable logger.debug('COMBINED[%s] >>\n%s', kind, out_df) + # Handle seed labeling toggles after slicing + if kind == 'nodes' and label_cols: + seeds_df = label_steps[0][1]._nodes if label_steps and label_steps[0][1]._nodes is not None else None + seed_ids = seeds_df[[id]].drop_duplicates() if seeds_df is not None and id in seeds_df.columns else None + label_seeds_true = any(isinstance(op, ASTEdge) and getattr(op, 'label_seeds', False) for op, _ in label_steps) + if seed_ids is not None: + if label_seeds_true: + # Ensure seeds are present and labeled 0 + seeds_with_labels = seed_ids.copy() + for col in label_cols: + if col in out_df.columns: + seeds_with_labels[col] = 0 + out_df = safe_merge(out_df, seeds_with_labels, on=id, how='outer', engine=engine) + else: + # Clear seed labels when label_seeds=False + if id in out_df.columns: + mask = out_df[id].isin(seed_ids[id]) + for col in label_cols: + if col in out_df.columns: + out_df.loc[mask, col] = pd.NA + # Backfill missing hop labels from forward label steps + hop_cols = [c for c in out_df.columns if 'hop' in c] + if hop_cols: + hop_maps = [] + for _, g_step in label_steps: + step_df = getattr(g_step, df_fld) + if id in step_df.columns: + for hc in hop_cols: + if hc in step_df.columns: + hop_maps.append(step_df[[id, hc]]) + hop_maps = [df for df in hop_maps if len(df) > 0] + if hop_maps: + hop_map_df = df_to_engine(df_concat(engine)(hop_maps), resolve_engine(EngineAbstract.AUTO, hop_maps[0])) + for hc in hop_cols: + if hc in hop_map_df.columns: + hop_map = hop_map_df[[id, hc]].dropna(subset=[hc]).drop_duplicates(subset=[id]).set_index(id)[hc] + out_df[hc] = out_df[hc].combine_first(out_df[id].map(hop_map)) + + # Collapse merge suffixes (_x/_y) into a single column + cols = list(out_df.columns) + for c in cols: + if c.endswith('_x'): + base = c[:-2] + c_y = base + '_y' + if c_y in out_df.columns: + if len(out_df) > 0: + out_df[base] = out_df[c].where(out_df[c].notna(), out_df[c_y]) + out_df = out_df.drop(columns=[c, c_y]) + elif c.endswith('_y'): + base = c[:-2] + c_x = base + '_x' + if c_x in out_df.columns: + if len(out_df) > 0: + out_df[base] = out_df[c_x].where(out_df[c_x].notna(), out_df[c]) + out_df = out_df.drop(columns=[c, c_x]) + return out_df @@ -722,6 +915,25 @@ def _chain_impl(self: Plottable, ops: Union[List[ASTObject], Chain], engine: Uni prev_orig_step = None else: prev_orig_step = g_stack[-(len(g_stack_reverse) + 2)] + # Reattach node attributes for reverse wavefronts so downstream matches work + 
prev_wavefront_nodes = prev_loop_step._nodes + if g._node is not None and prev_wavefront_nodes is not None and g._nodes is not None: + prev_wavefront_nodes = safe_merge( + g._nodes, + prev_wavefront_nodes[[g._node]], + on=g._node, + how='inner', + engine=engine_concrete + ) + target_wave_front_nodes = prev_orig_step._nodes if prev_orig_step is not None else None + if g._node is not None and target_wave_front_nodes is not None and g._nodes is not None: + target_wave_front_nodes = safe_merge( + g._nodes, + target_wave_front_nodes[[g._node]], + on=g._node, + how='inner', + engine=engine_concrete + ) assert prev_loop_step._nodes is not None g_step_reverse = ( (op.reverse())( @@ -732,10 +944,10 @@ def _chain_impl(self: Plottable, ops: Union[List[ASTObject], Chain], engine: Uni # check for hits against fully valid targets # ast will replace g.node() with this as its starting points - prev_node_wavefront=prev_loop_step._nodes, + prev_node_wavefront=prev_wavefront_nodes, # only allow transitions to these nodes (vs prev_node_wavefront) - target_wave_front=prev_orig_step._nodes if prev_orig_step is not None else None, + target_wave_front=target_wave_front_nodes, engine=engine_concrete ) @@ -750,10 +962,22 @@ def _chain_impl(self: Plottable, ops: Union[List[ASTObject], Chain], engine: Uni logger.debug('edges: %s', g_step._edges) logger.debug('============ COMBINE NODES ============') - final_nodes_df = combine_steps(g, 'nodes', list(zip(ops, reversed(g_stack_reverse))), engine_concrete) + final_nodes_df = combine_steps( + g, + 'nodes', + list(zip(ops, reversed(g_stack_reverse))), + engine_concrete, + label_steps=list(zip(ops, g_stack)) + ) logger.debug('============ COMBINE EDGES ============') - final_edges_df = combine_steps(g, 'edges', list(zip(ops, reversed(g_stack_reverse))), engine_concrete) + final_edges_df = combine_steps( + g, + 'edges', + list(zip(ops, reversed(g_stack_reverse))), + engine_concrete, + label_steps=list(zip(ops, g_stack)) + ) if added_edge_index: # Drop the internal edge index column (stored in g._edge after we added it) final_edges_df = final_edges_df.drop(columns=[g._edge]) @@ -762,6 +986,23 @@ def _chain_impl(self: Plottable, ops: Union[List[ASTObject], Chain], engine: Uni else: g_out = g.nodes(final_nodes_df).edges(final_edges_df) + # Ensure node set covers edge endpoints after any output slicing + if g_out._edges is not None and len(g_out._edges) > 0: + concat_fn = df_concat(engine_concrete) + endpoints = concat_fn( + [ + g_out._edges[[g_out._source]].rename(columns={g_out._source: g_out._node}), + g_out._edges[[g_out._destination]].rename(columns={g_out._destination: g_out._node}), + ], + ignore_index=True, + sort=False, + ).drop_duplicates(subset=[g_out._node]) + if resolve_engine(EngineAbstract.AUTO, endpoints) != resolve_engine(EngineAbstract.AUTO, g_out._nodes): + endpoints = df_to_engine(endpoints, resolve_engine(EngineAbstract.AUTO, g_out._nodes)) + g_out = g_out.nodes( + concat_fn([g_out._nodes, endpoints], ignore_index=True, sort=False).drop_duplicates(subset=[g_out._node]) + ) + # Mark as successful success = True diff --git a/graphistry/compute/gfql/call_safelist.py b/graphistry/compute/gfql/call_safelist.py index 4259a9058b..357c7e0633 100644 --- a/graphistry/compute/gfql/call_safelist.py +++ b/graphistry/compute/gfql/call_safelist.py @@ -47,6 +47,9 @@ def is_int(v: Any) -> bool: def is_bool(v: Any) -> bool: return isinstance(v, bool) +def is_int_or_none(v: Any) -> bool: + return v is None or isinstance(v, int) + def is_dict(v: Any) -> bool: return 
isinstance(v, dict) @@ -219,11 +222,20 @@ def validate_hypergraph_opts(v: Any) -> bool: 'nodes', 'hops', 'to_fixed_point', 'direction', 'source_node_match', 'edge_match', 'destination_node_match', 'source_node_query', 'edge_query', 'destination_node_query', - 'return_as_wave_front', 'target_wave_front', 'engine' + 'return_as_wave_front', 'target_wave_front', 'engine', + 'min_hops', 'max_hops', 'output_min_hops', 'output_max_hops', + 'label_node_hops', 'label_edge_hops', 'label_seeds' }, 'required_params': set(), 'param_validators': { 'hops': is_int, + 'min_hops': is_int, + 'max_hops': is_int, + 'output_min_hops': is_int_or_none, + 'output_max_hops': is_int_or_none, + 'label_node_hops': is_string_or_none, + 'label_edge_hops': is_string_or_none, + 'label_seeds': is_bool, 'to_fixed_point': is_bool, 'direction': lambda v: v in ['forward', 'reverse', 'undirected'], 'source_node_match': is_dict, @@ -235,7 +247,7 @@ def validate_hypergraph_opts(v: Any) -> bool: 'return_as_wave_front': is_bool, 'engine': is_string }, - 'description': 'Traverse graph by following edges' + 'description': 'Traverse edges with optional hop bounds and node/edge hop label columns' }, # In/out degree methods diff --git a/graphistry/compute/hop.py b/graphistry/compute/hop.py index 80e68e7f02..f21db09526 100644 --- a/graphistry/compute/hop.py +++ b/graphistry/compute/hop.py @@ -5,6 +5,7 @@ """ import logging from typing import List, Optional, Tuple, TYPE_CHECKING, Union +import pandas as pd from graphistry.Engine import ( EngineAbstract, df_concat, df_cons, df_to_engine, resolve_engine @@ -238,6 +239,14 @@ def process_hop_direction( def hop(self: Plottable, nodes: Optional[DataFrameT] = None, # chain: incoming wavefront hops: Optional[int] = 1, + *, + min_hops: Optional[int] = None, + max_hops: Optional[int] = None, + output_min_hops: Optional[int] = None, + output_max_hops: Optional[int] = None, + label_node_hops: Optional[str] = None, + label_edge_hops: Optional[str] = None, + label_seeds: bool = False, to_fixed_point: bool = False, direction: str = 'forward', edge_match: Optional[dict] = None, @@ -259,7 +268,11 @@ def hop(self: Plottable, g: Plotter nodes: dataframe with id column matching g._node. None signifies all nodes (default). - hops: consider paths of length 1 to 'hops' steps, if any (default 1). + hops: consider paths of length 1 to 'hops' steps, if any (default 1). Shorthand for max_hops. + min_hops/max_hops: inclusive traversal bounds; defaults preserve legacy behavior (min=1 unless max=0; max defaults to hops). + output_min_hops/output_max_hops: optional output slice applied after traversal; defaults keep all traversed hops up to max_hops. Useful for showing a subrange (e.g., min/max = 2..4 but display only hops 3..4). + label_node_hops/label_edge_hops: optional column names for hop numbers (omit or None to skip). Nodes record the first hop step they are reached (1 = first expansion); edges record the hop step that traversed them. + label_seeds: when True and labeling, also write hop 0 for seed nodes in the node label column. 
to_fixed_point: keep hopping until no new nodes are found (ignores hops) direction: 'forward', 'reverse', 'undirected' edge_match: dict of kv-pairs to exact match (see also: filter_edges_by_dict) @@ -284,6 +297,12 @@ def hop(self: Plottable, if isinstance(engine, str): engine = EngineAbstract(engine) + def _combine_first_no_warn(target, fill): + """Avoid pandas concat warning when combine_first sees empty inputs.""" + if target is None or len(target) == 0: + return target + return target.where(target.notna(), fill) + engine_concrete = resolve_engine(engine, self) if not TYPE_CHECKING: DataFrameT = df_cons(engine_concrete) @@ -318,15 +337,57 @@ def hop(self: Plottable, logger.debug('engine_concrete: %s', engine_concrete) logger.debug('---------------------') - if not to_fixed_point and not isinstance(hops, int): - raise ValueError(f'Must provide hops int when to_fixed_point is False, received: {hops}') - if direction not in ['forward', 'reverse', 'undirected']: raise ValueError(f'Invalid direction: "{direction}", must be one of: "forward" (default), "reverse", "undirected"') if target_wave_front is not None and nodes is None: raise ValueError('target_wave_front requires nodes to target against (for intermediate hops)') + # Resolve hop bounds with legacy compatibility + resolved_max_hops = max_hops if max_hops is not None else hops + resolved_min_hops = min_hops + + if not to_fixed_point: + if resolved_max_hops is not None and not isinstance(resolved_max_hops, int): + raise ValueError(f'Must provide integer hops when to_fixed_point is False, received: {resolved_max_hops}') + else: + resolved_max_hops = None + + if resolved_min_hops is None: + resolved_min_hops = 0 if resolved_max_hops == 0 else 1 + + if resolved_min_hops < 0: + raise ValueError(f'min_hops must be >= 0, received: {resolved_min_hops}') + + if resolved_max_hops is not None and resolved_max_hops < 0: + raise ValueError(f'max_hops must be >= 0, received: {resolved_max_hops}') + + if resolved_max_hops is not None and resolved_min_hops > resolved_max_hops: + raise ValueError(f'min_hops ({resolved_min_hops}) cannot exceed max_hops ({resolved_max_hops})') + + resolved_output_min = output_min_hops + resolved_output_max = output_max_hops + + if resolved_output_min is not None and resolved_output_min < 0: + raise ValueError(f'output_min_hops must be >= 0, received: {resolved_output_min}') + if resolved_output_max is not None and resolved_output_max < 0: + raise ValueError(f'output_max_hops must be >= 0, received: {resolved_output_max}') + if resolved_output_min is not None and resolved_output_max is not None and resolved_output_min > resolved_output_max: + raise ValueError(f'output_min_hops ({resolved_output_min}) cannot exceed output_max_hops ({resolved_output_max})') + + # Default output slice: include all traversed hops unless explicitly post-filtered + if resolved_output_max is None: + resolved_output_max = resolved_max_hops + + # Keep output slice within traversal range if both known + if resolved_output_min is not None and resolved_max_hops is not None and resolved_output_min > resolved_max_hops: + raise ValueError(f'output_min_hops ({resolved_output_min}) cannot exceed max_hops traversal bound ({resolved_max_hops})') + if resolved_output_max is not None and resolved_min_hops is not None and resolved_output_max < resolved_min_hops: + raise ValueError(f'output_max_hops ({resolved_output_max}) cannot be below min_hops traversal bound ({resolved_min_hops})') + + final_output_min = resolved_output_min + final_output_max = 
resolved_output_max + if destination_node_match == {}: destination_node_match = None @@ -362,7 +423,10 @@ def hop(self: Plottable, if debugging_hop and logger.isEnabledFor(logging.DEBUG): logger.debug('Node column conflicts with destination column, using temp name: %s', TEMP_DST_COL) - starting_nodes = nodes if nodes is not None else g2._nodes + seeds_provided = nodes is not None + starting_nodes = nodes if seeds_provided else g2._nodes + if starting_nodes is None: + raise ValueError('hop requires a node DataFrame; starting_nodes is None') if g2._edge is None: # Get the pre-filtered edges @@ -386,7 +450,39 @@ def hop(self: Plottable, if EDGE_ID not in edges_indexed.columns: raise ValueError(f"Edge binding column '{EDGE_ID}' (from g._edge='{g2._edge}') not found in edges. Available columns: {list(edges_indexed.columns)}") - hops_remaining = hops + def resolve_label_col(requested: Optional[str], df, default_base: str) -> Optional[str]: + if requested is None: + return generate_safe_column_name(default_base, df, prefix='__gfqlhop_', suffix='__') + if requested not in df.columns: + return requested + counter = 1 + candidate = f"{requested}_{counter}" + while candidate in df.columns: + counter = counter + 1 + candidate = f"{requested}_{counter}" + return candidate + + # Track hops when needed for labels, output slices, or min_hops pruning + needs_min_hop_pruning = resolved_min_hops is not None and resolved_min_hops > 1 + track_hops = bool( + label_node_hops + or label_edge_hops + or label_seeds + or output_min_hops is not None + or output_max_hops is not None + or needs_min_hop_pruning + ) + track_node_hops = track_hops or bool(label_node_hops or label_seeds) + track_edge_hops = track_hops or label_edge_hops is not None + + edge_hop_col = None + node_hop_col = None + if track_edge_hops: + edge_hop_col = resolve_label_col(label_edge_hops, edges_indexed, '_hop') + seen_edge_marker_col = generate_safe_column_name('__gfql_edge_seen__', edges_indexed, prefix='__seen_', suffix='__') + if track_node_hops: + node_hop_col = resolve_label_col(label_node_hops, g2._nodes, '_hop') + seen_node_marker_col = generate_safe_column_name('__gfql_node_seen__', g2._nodes, prefix='__seen_', suffix='__') wave_front = starting_nodes[[g2._node]][:0] @@ -400,6 +496,13 @@ def hop(self: Plottable, base_target_nodes = concat([target_wave_front, g2._nodes], ignore_index=True, sort=False).drop_duplicates(subset=[g2._node]) #TODO precompute src/dst match subset if multihop? 
+ node_hop_records = None + edge_hop_records = None + + if track_node_hops and label_seeds and node_hop_col is not None: + seed_nodes = starting_nodes[[g2._node]].drop_duplicates() + node_hop_records = seed_nodes.assign(**{node_hop_col: 0}) + if debugging_hop and logger.isEnabledFor(logging.DEBUG): logger.debug('~~~~~~~~~~ LOOP PRE ~~~~~~~~~~~') logger.debug('starting_nodes:\n%s', starting_nodes) @@ -410,11 +513,18 @@ def hop(self: Plottable, first_iter = True combined_node_ids = None + current_hop = 0 + max_reached_hop = 0 while True: + if not to_fixed_point and resolved_max_hops is not None and current_hop >= resolved_max_hops: + break + + current_hop += 1 + if debugging_hop and logger.isEnabledFor(logging.DEBUG): logger.debug('~~~~~~~~~~ LOOP STEP BEGIN ~~~~~~~~~~~') - logger.debug('hops_remaining: %s', hops_remaining) + logger.debug('current_hop: %s', current_hop) logger.debug('wave_front:\n%s', wave_front) logger.debug('matches_nodes:\n%s', matches_nodes) logger.debug('matches_edges:\n%s', matches_edges) @@ -429,11 +539,6 @@ def hop(self: Plottable, safe_merge(wave_front, self._nodes, on=g2._node, how='left'), ) - if not to_fixed_point and hops_remaining is not None: - if hops_remaining < 1: - break - hops_remaining = hops_remaining - 1 - assert len(wave_front.columns) == 1, "just indexes" wave_front_iter : DataFrameT = query_if_not_none( source_node_query, @@ -455,7 +560,8 @@ def hop(self: Plottable, intermediate_target_wave_front = None if target_wave_front is not None: # Calculate this once for both directions - if hops_remaining: + has_more_hops_planned = to_fixed_point or resolved_max_hops is None or current_hop < resolved_max_hops + if has_more_hops_planned: intermediate_target_wave_front = concat([ target_wave_front[[g2._node]], self._nodes[[g2._node]] @@ -526,6 +632,61 @@ def hop(self: Plottable, + ( [ new_node_ids_reverse] if new_node_ids_reverse is not None else mt ), # noqa: W503 ignore_index=True, sort=False).drop_duplicates() + if len(new_node_ids) > 0: + max_reached_hop = current_hop + + if track_edge_hops and edge_hop_col is not None: + assert seen_edge_marker_col is not None + edge_label_candidates : List[DataFrameT] = [] + if hop_edges_forward is not None: + edge_label_candidates.append(hop_edges_forward[[EDGE_ID]]) + if hop_edges_reverse is not None: + edge_label_candidates.append(hop_edges_reverse[[EDGE_ID]]) + + for edge_df_iter in edge_label_candidates: + if len(edge_df_iter) == 0: + continue + labeled_edges = edge_df_iter.assign(**{edge_hop_col: current_hop}) + if edge_hop_records is None: + edge_hop_records = labeled_edges + else: + edge_seen = edge_hop_records[[EDGE_ID]].assign(**{seen_edge_marker_col: 1}) + merged_edge_labels = safe_merge( + labeled_edges, + edge_seen, + on=EDGE_ID, + how='left', + engine=engine_concrete + ) + new_edge_labels = merged_edge_labels[merged_edge_labels[seen_edge_marker_col].isna()].drop(columns=[seen_edge_marker_col]) + if len(new_edge_labels) > 0: + edge_hop_records = concat( + [edge_hop_records, new_edge_labels], + ignore_index=True, + sort=False + ).drop_duplicates(subset=[EDGE_ID]) + + if track_node_hops and node_hop_col is not None: + assert seen_node_marker_col is not None + if node_hop_records is None: + node_hop_records = new_node_ids.assign(**{node_hop_col: current_hop}) + else: + node_seen = node_hop_records[[g2._node]].assign(**{seen_node_marker_col: 1}) + merged_node_labels = safe_merge( + new_node_ids, + node_seen, + on=g2._node, + how='left', + engine=engine_concrete + ) + new_node_labels = 
merged_node_labels[merged_node_labels[seen_node_marker_col].isna()].drop(columns=[seen_node_marker_col]) + if len(new_node_labels) > 0: + node_hop_records = concat( + [node_hop_records, new_node_labels.assign(**{node_hop_col: current_hop})], + ignore_index=True, + sort=False + ).drop_duplicates(subset=[g2._node]) + if debugging_hop and logger.isEnabledFor(logging.DEBUG): logger.debug('~~~~~~~~~~ LOOP STEP MERGES 1 ~~~~~~~~~~~') logger.debug('matches_edges:\n%s', matches_edges) @@ -583,8 +744,105 @@ def hop(self: Plottable, logger.debug('nodes (init):\n%s', nodes) logger.debug('target_wave_front:\n%s', target_wave_front) + if resolved_min_hops is not None and max_reached_hop < resolved_min_hops: + matches_nodes = starting_nodes[[g2._node]][:0] + matches_edges = edges_indexed[[EDGE_ID]][:0] + if node_hop_records is not None: + node_hop_records = node_hop_records[:0] + if edge_hop_records is not None: + edge_hop_records = edge_hop_records[:0] + + # Prune dead-end branches that don't reach min_hops + # When min_hops > 1, only keep edges/nodes on paths that reach at least min_hops + if ( + resolved_min_hops is not None + and resolved_min_hops > 1 + and node_hop_records is not None + and edge_hop_records is not None + and node_hop_col is not None + and edge_hop_col is not None + and max_reached_hop >= resolved_min_hops + ): + # Find goal nodes (nodes at hop >= min_hops) + goal_nodes = set( + node_hop_records[node_hop_records[node_hop_col] >= resolved_min_hops][g2._node].tolist() + ) + + if goal_nodes: + # Backtrack from goal nodes to find all edges/nodes on valid paths + # We need to traverse backwards through the edge records to find which edges lead to goals + edge_records_with_endpoints = safe_merge( + edge_hop_records, + edges_indexed[[EDGE_ID, g2._source, g2._destination]], + on=EDGE_ID, + how='inner' + ) + + # Build sets of valid nodes and edges by backtracking from goal nodes + valid_nodes = set(goal_nodes) + valid_edges = set() + + # Start with edges that lead TO goal nodes + current_targets = goal_nodes + + # Backtrack through hops from max edge hop down to 1 + # Use actual max edge hop, not max_reached_hop which may include extra traversal steps + max_edge_hop = int(edge_hop_records[edge_hop_col].max()) if len(edge_hop_records) > 0 else max_reached_hop + for hop_level in range(max_edge_hop, 0, -1): + # Find edges at this hop level that reach current targets + hop_edges = edge_records_with_endpoints[ + edge_records_with_endpoints[edge_hop_col] == hop_level + ] + + if direction == 'forward': + # Forward: edges go src->dst, so dst should be in targets + reaching_edges = hop_edges[hop_edges[g2._destination].isin(current_targets)] + new_sources = set(reaching_edges[g2._source].tolist()) + elif direction == 'reverse': + # Reverse: edges go dst->src conceptually, so src should be in targets + reaching_edges = hop_edges[hop_edges[g2._source].isin(current_targets)] + new_sources = set(reaching_edges[g2._destination].tolist()) + else: + # Undirected: either endpoint could be in targets + reaching_fwd = hop_edges[hop_edges[g2._destination].isin(current_targets)] + reaching_rev = hop_edges[hop_edges[g2._source].isin(current_targets)] + reaching_edges = concat([reaching_fwd, reaching_rev], ignore_index=True, sort=False).drop_duplicates(subset=[EDGE_ID]) + new_sources = set(reaching_fwd[g2._source].tolist()) | set(reaching_rev[g2._destination].tolist()) + + valid_edges.update(reaching_edges[EDGE_ID].tolist()) + valid_nodes.update(new_sources) + current_targets = new_sources + # Filter records to 
only valid paths + edge_hop_records = edge_hop_records[edge_hop_records[EDGE_ID].isin(valid_edges)] + node_hop_records = node_hop_records[node_hop_records[g2._node].isin(valid_nodes)] + matches_edges = matches_edges[matches_edges[EDGE_ID].isin(valid_edges)] + if matches_nodes is not None: + matches_nodes = matches_nodes[matches_nodes[g2._node].isin(valid_nodes)] + #hydrate edges - final_edges = safe_merge(edges_indexed, matches_edges, on=EDGE_ID, how='inner') + if track_edge_hops and edge_hop_col is not None: + edge_labels_source = edge_hop_records + if edge_labels_source is None: + edge_labels_source = edges_indexed[[EDGE_ID]][:0].assign(**{edge_hop_col: []}) + + edge_mask = None + if final_output_min is not None: + edge_mask = edge_labels_source[edge_hop_col] >= final_output_min + if final_output_max is not None: + max_mask = edge_labels_source[edge_hop_col] <= final_output_max + edge_mask = max_mask if edge_mask is None else edge_mask & max_mask + + if edge_mask is not None: + edge_labels_source = edge_labels_source[edge_mask] + + final_edges = safe_merge(edges_indexed, edge_labels_source, on=EDGE_ID, how='inner') + if label_edge_hops is None and edge_hop_col in final_edges: + # Preserve hop labels when output slicing is requested so callers can filter + if output_min_hops is None and output_max_hops is None: + final_edges = final_edges.drop(columns=[edge_hop_col]) + else: + final_edges = safe_merge(edges_indexed, matches_edges, on=EDGE_ID, how='inner') + if EDGE_ID not in self._edges: final_edges = final_edges.drop(columns=[EDGE_ID]) g_out = g2.edges(final_edges) @@ -592,11 +850,6 @@ def hop(self: Plottable, #hydrate nodes if self._nodes is not None: logger.debug('~~~~~~~~~~ NODES HYDRATION ~~~~~~~~~~~') - #FIXME what was this for? Removed for shortest-path reverse pass fixes - #if target_wave_front is not None: - # rich_nodes = target_wave_front - #else: - # rich_nodes = self._nodes rich_nodes = self._nodes if target_wave_front is not None: rich_nodes = concat([rich_nodes, target_wave_front], ignore_index=True, sort=False).drop_duplicates(subset=[g2._node]) @@ -605,13 +858,219 @@ def hop(self: Plottable, logger.debug('matches_nodes:\n%s', matches_nodes) logger.debug('wave_front:\n%s', wave_front) logger.debug('self._nodes:\n%s', self._nodes) - final_nodes = safe_merge( - rich_nodes, - matches_nodes if matches_nodes is not None else wave_front[:0], - on=self._node, - how='inner') + + base_nodes = matches_nodes if matches_nodes is not None else wave_front[:0] + + if track_node_hops and node_hop_col is not None: + node_labels_source = node_hop_records + if node_labels_source is None: + node_labels_source = base_nodes.assign(**{node_hop_col: []}) + + node_labels_source = node_labels_source.copy() + unfiltered_node_labels_source = node_labels_source.copy() + node_mask = None + if final_output_min is not None: + node_mask = node_labels_source[node_hop_col] >= final_output_min + if final_output_max is not None: + max_node_mask = node_labels_source[node_hop_col] <= final_output_max + node_mask = max_node_mask if node_mask is None else node_mask & max_node_mask + + if node_mask is not None: + node_labels_source.loc[~node_mask, node_hop_col] = pd.NA + + if label_seeds: + if node_hop_records is not None: + seed_rows = node_hop_records[node_hop_col] == 0 + if seed_rows.any(): + seeds_for_output = node_hop_records[seed_rows] + node_labels_source = concat( + [node_labels_source, seeds_for_output], + ignore_index=True, + sort=False + ).drop_duplicates(subset=[g2._node]) + elif starting_nodes is not 
None and g2._node in starting_nodes.columns: + seed_nodes = starting_nodes[[g2._node]].drop_duplicates() + node_labels_source = concat( + [node_labels_source, seed_nodes.assign(**{node_hop_col: 0})], + ignore_index=True, + sort=False + ).drop_duplicates(subset=[g2._node]) + + filtered_nodes = safe_merge( + base_nodes, + node_labels_source[[g2._node]], + on=g2._node, + how='inner') + + final_nodes = safe_merge( + rich_nodes, + filtered_nodes, + on=self._node, + how='inner') + + final_nodes = safe_merge( + final_nodes, + node_labels_source, + on=g2._node, + how='left') + + if node_hop_col in final_nodes and unfiltered_node_labels_source is not None: + fallback_map = ( + unfiltered_node_labels_source[[g2._node, node_hop_col]] + .drop_duplicates(subset=[g2._node]) + .set_index(g2._node)[node_hop_col] + ) + try: + final_nodes[node_hop_col] = _combine_first_no_warn( + final_nodes[node_hop_col], + final_nodes[g2._node].map(fallback_map) + ) + except Exception: + pass + + try: + if final_nodes[node_hop_col].notna().all(): + final_nodes[node_hop_col] = final_nodes[node_hop_col].astype('int64') + except Exception: + pass + + if label_node_hops is None and node_hop_col in final_nodes: + final_nodes = final_nodes.drop(columns=[node_hop_col]) + else: + final_nodes = safe_merge( + rich_nodes, + base_nodes, + on=self._node, + how='inner') + g_out = g_out.nodes(final_nodes) + # Ensure all edge endpoints are present in nodes + if g_out._edges is not None and len(g_out._edges) > 0 and g_out._nodes is not None: + endpoints = concat( + [ + g_out._edges[[g_out._source]].rename(columns={g_out._source: g_out._node}), + g_out._edges[[g_out._destination]].rename(columns={g_out._destination: g_out._node}), + ], + ignore_index=True, + sort=False, + ).drop_duplicates(subset=[g_out._node]) + if track_node_hops and node_hop_records is not None and node_hop_col is not None: + endpoints = safe_merge( + endpoints, + node_hop_records[[g_out._node, node_hop_col]].drop_duplicates(subset=[g_out._node]), + on=g_out._node, + how='left' + ) + # Align engine types + if resolve_engine(EngineAbstract.AUTO, endpoints) != resolve_engine(EngineAbstract.AUTO, g_out._nodes): + endpoints = df_to_engine(endpoints, resolve_engine(EngineAbstract.AUTO, g_out._nodes)) + g_out = g_out.nodes( + concat([g_out._nodes, endpoints], ignore_index=True, sort=False).drop_duplicates(subset=[g_out._node]) + ) + + if track_node_hops and node_hop_records is not None and node_hop_col is not None and g_out._nodes is not None: + hop_map = ( + node_hop_records[[g_out._node, node_hop_col]] + .drop_duplicates(subset=[g_out._node]) + .set_index(g_out._node)[node_hop_col] + ) + if g_out._node in g_out._nodes.columns and node_hop_col in g_out._nodes.columns: + try: + mapped = g_out._nodes[g_out._node].map(hop_map) + g_out._nodes[node_hop_col] = g_out._nodes[node_hop_col].where( + g_out._nodes[node_hop_col].notna(), + mapped + ) + except Exception: + pass + seeds_mask = None + if seeds_provided and not label_seeds and starting_nodes is not None and g_out._node in starting_nodes.columns: + seed_ids = starting_nodes[[g_out._node]].drop_duplicates() + seeds_mask = g_out._nodes[g_out._node].isin(seed_ids[g_out._node]) + missing_mask = g_out._nodes[node_hop_col].isna() + if seeds_mask is not None: + missing_mask = missing_mask & ~seeds_mask + if g_out._edges is not None and edge_hop_col is not None and edge_hop_col in g_out._edges.columns: + edge_map_df = concat( + [ + g_out._edges[[g_out._source, edge_hop_col]].rename(columns={g_out._source: g_out._node}), + 
+        if track_node_hops and node_hop_records is not None and node_hop_col is not None and g_out._nodes is not None:
+            hop_map = (
+                node_hop_records[[g_out._node, node_hop_col]]
+                .drop_duplicates(subset=[g_out._node])
+                .set_index(g_out._node)[node_hop_col]
+            )
+            if g_out._node in g_out._nodes.columns and node_hop_col in g_out._nodes.columns:
+                try:
+                    mapped = g_out._nodes[g_out._node].map(hop_map)
+                    g_out._nodes[node_hop_col] = g_out._nodes[node_hop_col].where(
+                        g_out._nodes[node_hop_col].notna(),
+                        mapped
+                    )
+                except Exception:
+                    pass
+            seeds_mask = None
+            if seeds_provided and not label_seeds and starting_nodes is not None and g_out._node in starting_nodes.columns:
+                seed_ids = starting_nodes[[g_out._node]].drop_duplicates()
+                seeds_mask = g_out._nodes[g_out._node].isin(seed_ids[g_out._node])
+            missing_mask = g_out._nodes[node_hop_col].isna()
+            if seeds_mask is not None:
+                missing_mask = missing_mask & ~seeds_mask
+            if g_out._edges is not None and edge_hop_col is not None and edge_hop_col in g_out._edges.columns:
+                edge_map_df = concat(
+                    [
+                        g_out._edges[[g_out._source, edge_hop_col]].rename(columns={g_out._source: g_out._node}),
+                        g_out._edges[[g_out._destination, edge_hop_col]].rename(columns={g_out._destination: g_out._node}),
+                    ],
+                    ignore_index=True,
+                    sort=False,
+                )
+                if len(edge_map_df) > 0:
+                    edge_map = edge_map_df.groupby(g_out._node)[edge_hop_col].min()
+                else:
+                    edge_map = pd.Series([], dtype='float64')
+                mapped_edge_hops = g_out._nodes[g_out._node].map(edge_map)
+                if seeds_mask is not None:
+                    mapped_edge_hops = mapped_edge_hops.mask(seeds_mask)
+                g_out._nodes[node_hop_col] = _combine_first_no_warn(
+                    g_out._nodes[node_hop_col],
+                    mapped_edge_hops
+                )
+                if missing_mask.any():
+                    g_out._nodes.loc[missing_mask, node_hop_col] = g_out._nodes.loc[missing_mask, g_out._node].map(edge_map)
+                if seeds_mask is not None:
+                    zero_seed_mask = seeds_mask & g_out._nodes[node_hop_col].fillna(-1).eq(0)
+                    g_out._nodes.loc[zero_seed_mask, node_hop_col] = pd.NA
+            try:
+                g_out._nodes[node_hop_col] = pd.to_numeric(g_out._nodes[node_hop_col], errors='coerce')
+                if pd.api.types.is_numeric_dtype(g_out._nodes[node_hop_col]):
+                    g_out._nodes[node_hop_col] = g_out._nodes[node_hop_col].astype('Int64')
+            except Exception:
+                pass
+
+        if (
+            not label_seeds
+            and seeds_provided
+            and g_out._nodes is not None
+            and len(g_out._nodes) > 0
+            and node_hop_records is not None
+            and g_out._node in g_out._nodes.columns
+            and starting_nodes is not None
+            and g_out._node in starting_nodes.columns
+            and node_hop_col is not None
+        ):
+            seed_mask_all = g_out._nodes[g_out._node].isin(starting_nodes[g_out._node])
+            if direction == 'undirected':
+                g_out._nodes.loc[seed_mask_all, node_hop_col] = pd.NA
+            else:
+                seen_nodes = set(node_hop_records[g_out._node].dropna().tolist())
+                seed_ids = starting_nodes[g_out._node].dropna().unique().tolist()
+                unreached_seed_ids = set(seed_ids) - seen_nodes
+                if unreached_seed_ids:
+                    mask = g_out._nodes[g_out._node].isin(unreached_seed_ids)
+                    g_out._nodes.loc[mask, node_hop_col] = pd.NA
+
+        if g_out._nodes is not None and (final_output_min is not None or final_output_max is not None):
+            try:
+                mask = pd.Series(True, index=g_out._nodes.index)
+                if node_hop_col is not None and node_hop_col in g_out._nodes.columns:
+                    if final_output_min is not None:
+                        mask = mask & (g_out._nodes[node_hop_col] >= final_output_min)
+                    if final_output_max is not None:
+                        mask = mask & (g_out._nodes[node_hop_col] <= final_output_max)
+                endpoint_ids = None
+                if g_out._edges is not None:
+                    endpoint_ids = pd.concat(
+                        [
+                            g_out._edges[[g_out._source]].rename(columns={g_out._source: g_out._node}),
+                            g_out._edges[[g_out._destination]].rename(columns={g_out._destination: g_out._node}),
+                        ],
+                        ignore_index=True,
+                        sort=False,
+                    ).drop_duplicates(subset=[g_out._node])
+                    mask = mask | g_out._nodes[g_out._node].isin(endpoint_ids[g_out._node])
+                if label_seeds and seeds_provided and starting_nodes is not None and g_out._node in starting_nodes.columns:
+                    seed_ids = starting_nodes[[g_out._node]].drop_duplicates()
+                    mask = mask | g_out._nodes[g_out._node].isin(seed_ids[g_out._node])
+                g_out = g_out.nodes(g_out._nodes[mask].drop_duplicates(subset=[g_out._node]))
+            except Exception:
+                pass
+
     if debugging_hop and logger.isEnabledFor(logging.DEBUG):
         logger.debug('~~~~~~~~~~ HOP OUTPUT ~~~~~~~~~~~')
         logger.debug('nodes:\n%s', g_out._nodes)
@@ -619,4 +1078,21 @@ def hop(self: Plottable,
         logger.debug('======== /HOP =============')
         logger.debug('==========================')
 
+    if (
+        return_as_wave_front
+        and resolved_min_hops is not None
+        and resolved_min_hops >= 1
+        and seeds_provided
+        and not label_seeds
+        and g_out._nodes is not None
+        and starting_nodes is not None
+        and g_out._node in starting_nodes.columns
+    ):
+        seed_ids = starting_nodes[[g_out._node]].drop_duplicates()
+        seeds_not_reached = seed_ids
+        if matches_nodes is not None and g_out._node in matches_nodes.columns:
+            seeds_not_reached = seed_ids[~seed_ids[g_out._node].isin(matches_nodes[g_out._node])]
+        filtered_nodes = g_out._nodes[~g_out._nodes[g_out._node].isin(seeds_not_reached[g_out._node])]
+        g_out = g_out.nodes(filtered_nodes)
+
     return g_out
diff --git a/graphistry/compute/predicates/str.py b/graphistry/compute/predicates/str.py
index cba5dde6ec..27d1015f99 100644
--- a/graphistry/compute/predicates/str.py
+++ b/graphistry/compute/predicates/str.py
@@ -1,9 +1,17 @@
-from typing import Optional, Union
+from typing import Any, Optional, Union
+
+import pandas as pd
 
 from .ASTPredicate import ASTPredicate
 from graphistry.compute.typing import SeriesT
 
 
+def _cudf_mask_none(result: Any, mask: Any) -> Any:
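+    # Note: 'mask' is a positional boolean array (e.g., numpy) marking NA slots;
+    # the detour through pandas object dtype exists because cuDF bool columns
+    # cannot hold None.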
+    result_pd = result.to_pandas().astype('object')
+    result_pd.iloc[mask] = None
+    return result_pd
+
+
 class Contains(ASTPredicate):
     def __init__(
         self,
@@ -151,8 +159,6 @@ def __call__(self, s: SeriesT) -> SeriesT:
         elif not is_cudf and not self.case:
             # pandas tuple with case-insensitive - need workaround
             if len(self.pat) == 0:
-                import pandas as pd
-
                 # Create False for all values
                 result = pd.Series([False] * len(s), index=s.index)
                 # Preserve NA values when na=None (default)
                 if self.na is None:
@@ -170,7 +176,6 @@ def __call__(self, s: SeriesT) -> SeriesT:
             # cuDF - need manual OR logic (workaround for bug #20237)
             if len(self.pat) == 0:
                 import cudf
-                import pandas as pd
                 # Create False for all values
                 result = cudf.Series([False] * len(s), index=s.index)
                 # Preserve NA values when na=None (default) - match pandas behavior
@@ -178,10 +183,9 @@ def __call__(self, s: SeriesT) -> SeriesT:
                 # cuDF bool dtype can't hold None, so check if we need object dtype
                 has_na: bool = bool(s.isna().any())
                 if has_na:
-                    # Convert to object dtype to preserve None values
-                    result_pd = result.to_pandas().astype('object')  # type: ignore[operator]
-                    result_pd[s.to_pandas().isna()] = None
-                    result = cudf.from_pandas(result_pd)
+                    # Convert to object dtype and apply mask to preserve None values
+                    na_mask_arr = s.to_pandas().isna().to_numpy()
+                    result = cudf.from_pandas(_cudf_mask_none(result, na_mask_arr))
                 else:
                     if not self.case:
                         s_modified = s.str.lower()
@@ -321,7 +325,6 @@ def __call__(self, s: SeriesT) -> SeriesT:
         elif not is_cudf and not self.case:
             # pandas tuple with case-insensitive - need workaround
             if len(self.pat) == 0:
-                import pandas as pd
                 # Create False for all values
                 result = pd.Series([False] * len(s), index=s.index)
                 # Preserve NA values when na=None (default)
@@ -340,7 +343,6 @@ def __call__(self, s: SeriesT) -> SeriesT:
             # cuDF - need manual OR logic (workaround for bug #20237)
             if len(self.pat) == 0:
                 import cudf
-                import pandas as pd
                 # Create False for all values
                 result = cudf.Series([False] * len(s), index=s.index)
                 # Preserve NA values when na=None (default) - match pandas behavior
@@ -348,10 +350,9 @@ def __call__(self, s: SeriesT) -> SeriesT:
                 # cuDF bool dtype can't hold None, so check if we need object dtype
                 has_na: bool = bool(s.isna().any())
                 if has_na:
-                    # Convert to object dtype to preserve None values
-                    result_pd = result.to_pandas().astype('object')  # type: ignore[operator]
-                    result_pd[s.to_pandas().isna()] = None
-                    result = cudf.from_pandas(result_pd)
+                    # Convert to object dtype and apply mask to preserve None values
+                    na_mask_arr = s.to_pandas().isna().to_numpy()
+                    result = cudf.from_pandas(_cudf_mask_none(result, na_mask_arr))
                 else:
                     if not self.case:
                         s_modified = s.str.lower()
diff --git a/graphistry/gfql/ref/enumerator.py b/graphistry/gfql/ref/enumerator.py
index a06f930cbb..ed360565be 100644
--- a/graphistry/gfql/ref/enumerator.py
+++ b/graphistry/gfql/ref/enumerator.py
@@ -47,6 +47,9 @@ class OracleResult:
     edges: pd.DataFrame
     tags: Dict[str, Set[Any]]
     paths: Optional[List[Dict[str, Any]]] = None
+    # Hop labels: node_id -> hop_distance, edge_id -> hop_distance
+    node_hop_labels: Optional[Dict[Any, int]] = None
+    edge_hop_labels: Optional[Dict[Any, int]] = None
 
 
 def col(alias: str, column: str) -> StepColumnRef:
@@ -98,25 +101,61 @@ def enumerate_chain(
         edge_frame = _build_edge_frame(
             edges_df, edge_id, edge_src, edge_dst, edge_step, alias_requirements
         )
-        paths = paths.merge(
-            edge_frame,
-            left_on=current,
-            right_on=edge_step["src_col"],
-            how="inner",
-            validate="m:m",
-        ).drop(columns=[edge_step["src_col"]])
-        current = edge_step["dst_col"]
-
-        node_frame = _build_node_frame(nodes_df, node_id, node_step, alias_requirements)
-        paths = paths.merge(
-            node_frame,
-            left_on=current,
-            right_on=node_step["id_col"],
-            how="inner",
-            validate="m:1",
-        )
-        paths = paths.drop(columns=[current])
-        current = node_step["id_col"]
+        # Both branches below consume node_frame, so build it before branching
+        node_frame = _build_node_frame(nodes_df, node_id, node_step, alias_requirements)
+
+        min_hops = edge_step["min_hops"]
+        max_hops = edge_step["max_hops"]
+        if min_hops == 1 and max_hops == 1:
+            paths = paths.merge(
+                edge_frame,
+                left_on=current,
+                right_on=edge_step["src_col"],
+                how="inner",
+                validate="m:m",
+            ).drop(columns=[edge_step["src_col"]])
+            current = edge_step["dst_col"]
+
+            paths = paths.merge(
+                node_frame,
+                left_on=current,
+                right_on=node_step["id_col"],
+                how="inner",
+                validate="m:1",
+            )
+            paths = paths.drop(columns=[current])
+            current = node_step["id_col"]
+        else:
+            if where:
+                raise ValueError("WHERE clauses not supported for multi-hop edges in enumerator")
+            if edge_step["alias"] or node_step["alias"]:
+                # Alias tagging for multi-hop not yet supported in enumerator
+                raise ValueError("Aliases not supported for multi-hop edges in enumerator")
+
+            dest_allowed: Optional[Set[Any]] = None
+            if not node_frame.empty:
+                dest_allowed = set(node_frame[node_step["id_col"]])
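+
+            # Note: the oracle brute-forces every bounded path per seed via
+            # _bounded_paths (below); acceptable for the small parity graphs
+            # this reference enumerator targets, exponential on dense inputs
+            # (guarded by caps).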
"collected_node_hops" in edge_step: + for nid, hop in edge_step["collected_node_hops"].items(): + if nid not in all_node_hops or all_node_hops[nid] > hop: + all_node_hops[nid] = hop + if "collected_edge_hops" in edge_step: + for eid, hop in edge_step["collected_edge_hops"].items(): + if eid not in all_edge_hops or all_edge_hops[eid] > hop: + all_edge_hops[eid] = hop + + # Apply output slicing if specified + output_node_hops = dict(all_node_hops) + output_edge_hops = dict(all_edge_hops) + for edge_step in edge_steps: + out_min = edge_step.get("output_min_hops") + out_max = edge_step.get("output_max_hops") + if out_min is not None or out_max is not None: + # Filter node hops by output bounds + output_node_hops = { + nid: hop for nid, hop in output_node_hops.items() + if (out_min is None or hop >= out_min) and (out_max is None or hop <= out_max) + } + # Filter edge hops by output bounds + output_edge_hops = { + eid: hop for eid, hop in output_edge_hops.items() + if (out_min is None or hop >= out_min) and (out_max is None or hop <= out_max) + } + # Also filter collected_edges/collected_nodes for output + if "collected_edges" in edge_step: + edge_step["collected_edges"] = { + eid for eid in edge_step["collected_edges"] + if eid in output_edge_hops + } + if "collected_nodes" in edge_step: + # For node slicing, we need to look at the associated node_step + pass # Node filtering handled via output_node_hops + + has_output_slice = any( + edge_step.get("output_min_hops") is not None or edge_step.get("output_max_hops") is not None + for edge_step in edge_steps + ) + # First, collect edges edge_ids: Set[Any] = set() for edge_step in edge_steps: col = edge_step["id_col"] if col in paths: edge_ids.update(paths[col].tolist()) + if "collected_edges" in edge_step: + edge_ids.update(edge_step["collected_edges"]) + # If output slicing was applied, filter to edges within output bounds + if has_output_slice and output_edge_hops: + edge_ids = edge_ids & set(output_edge_hops.keys()) edges_out = edges_df[edges_df[edge_id].isin(edge_ids)].reset_index(drop=True) + # Collect nodes: include endpoints of kept edges (like GFQL does) + node_ids: Set[Any] = set() + for node_step in node_steps: + node_ids.update(paths[node_step["id_col"]].tolist()) + for edge_step in edge_steps: + if "collected_nodes" in edge_step: + node_ids.update(edge_step["collected_nodes"]) + + # If output slicing, nodes must be endpoints of kept edges (not just within hop range) + if has_output_slice and not edges_out.empty: + # Nodes that are endpoints of kept edges + edge_endpoint_nodes: Set[Any] = set() + edge_endpoint_nodes.update(edges_out[edge_src].tolist()) + edge_endpoint_nodes.update(edges_out[edge_dst].tolist()) + node_ids = node_ids & edge_endpoint_nodes + elif has_output_slice: + # No edges kept, so only nodes within output bounds + if output_node_hops: + node_ids = node_ids & set(output_node_hops.keys()) + nodes_out = nodes_df[nodes_df[node_id].isin(node_ids)].reset_index(drop=True) + tags = _build_tags(paths, node_steps, edge_steps) tags = {alias: set(values) for alias, values in tags.items()} path_bindings = _extract_paths(paths, node_steps, edge_steps) if include_paths else None - return OracleResult(nodes_out, edges_out, tags, path_bindings) + + # Return hop labels (use output-filtered versions if slicing was applied) + final_node_hops = output_node_hops if has_output_slice else all_node_hops + final_edge_hops = output_edge_hops if has_output_slice else all_edge_hops + + return OracleResult( + nodes_out, + edges_out, + tags, + 
path_bindings, + node_hop_labels=final_node_hops if final_node_hops else None, + edge_hop_labels=final_edge_hops if final_edge_hops else None, + ) def _coerce_ops(ops: Sequence[ASTObject]) -> List[ASTObject]: @@ -195,8 +313,17 @@ def _prepare_steps( } ) else: - if not isinstance(op, ASTEdge) or op.hops not in (None, 1): - raise ValueError("Enumerator only supports single-hop ASTEdge steps") + if not isinstance(op, ASTEdge): + raise ValueError("Chain must alternate ASTNode and ASTEdge") + if op.to_fixed_point: + raise ValueError("Enumerator does not support to_fixed_point edges") + # Normalize hop bounds (supports min/max hops for small graphs) + hop_min = op.min_hops if op.min_hops is not None else (op.hops if isinstance(op.hops, int) else 1) + hop_max = op.max_hops if op.max_hops is not None else (op.hops if isinstance(op.hops, int) else hop_min) + if hop_min is None or hop_max is None: + raise ValueError("Enumerator requires finite hop bounds") + if hop_min < 0 or hop_max < 0 or hop_min > hop_max: + raise ValueError(f"Invalid hop bounds min={hop_min}, max={hop_max}") edges.append( { "alias": op._name, @@ -206,6 +333,16 @@ def _prepare_steps( "id_col": f"edge_{len(edges)}", "src_col": f"edge_{len(edges)}_src", "dst_col": f"edge_{len(edges)}_dst", + "min_hops": hop_min, + "max_hops": hop_max, + # New hop label/slice params + "output_min_hops": op.output_min_hops, + "output_max_hops": op.output_max_hops, + "label_node_hops": op.label_node_hops, + "label_edge_hops": op.label_edge_hops, + "label_seeds": getattr(op, 'label_seeds', False), + "source_node_match": op.source_node_match, + "destination_node_match": op.destination_node_match, } ) expect_node = not expect_node @@ -359,7 +496,7 @@ def _build_edge_frame( def _apply_where(paths: pd.DataFrame, where: Sequence[WhereComparison]) -> pd.Series: - mask = pd.Series(True, index=paths.index) + mask: pd.Series = pd.Series(True, index=paths.index, dtype=bool) for clause in where: left_key = _alias_key(clause.left.alias, clause.left.column) right_key = _alias_key(clause.right.alias, clause.right.column) @@ -372,7 +509,8 @@ def _apply_where(paths: pd.DataFrame, where: Sequence[WhereComparison]) -> pd.Se result = _compare(left, right, clause.op) except Exception: result = pd.Series(False, index=paths.index) - mask &= valid & result.fillna(False) + result_bool = result.fillna(False).astype(bool) + mask &= valid & result_bool return mask @@ -392,6 +530,105 @@ def _compare(lhs: pd.Series, rhs: pd.Series, op: ComparisonOp) -> pd.Series: raise ValueError(f"Unsupported comparison operator '{op}'") +@dataclass +class BoundedPathResult: + """Result from bounded path enumeration with hop tracking.""" + seed_to_nodes: Dict[Any, Set[Any]] + edges_used: Set[Any] + nodes_used: Set[Any] + # Hop labels: entity_id -> minimum hop distance from any seed + node_hops: Dict[Any, int] + edge_hops: Dict[Any, int] + + +def _bounded_paths( + seeds: Sequence[Any], + edges_df: pd.DataFrame, + step: Dict[str, Any], + dest_allowed: Optional[Set[Any]], + caps: OracleCaps, +) -> BoundedPathResult: + """ + Enumerate bounded-hop paths for a single Edge step (direction already normalized in edges_df). + Returns BoundedPathResult with reachable nodes, edges used, and hop labels. 
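+        # Note: fillna(False) can leave an object-dtype series; the explicit
+        # bool cast below keeps the running mask strictly boolean.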
+        result_bool = result.fillna(False).astype(bool)
+        mask &= valid & result_bool
     return mask
 
 
@@ -392,6 +530,105 @@ def _compare(lhs: pd.Series, rhs: pd.Series, op: ComparisonOp) -> pd.Series:
     raise ValueError(f"Unsupported comparison operator '{op}'")
 
 
+@dataclass
+class BoundedPathResult:
+    """Result from bounded path enumeration with hop tracking."""
+    seed_to_nodes: Dict[Any, Set[Any]]
+    edges_used: Set[Any]
+    nodes_used: Set[Any]
+    # Hop labels: entity_id -> minimum hop distance from any seed
+    node_hops: Dict[Any, int]
+    edge_hops: Dict[Any, int]
+
+
+def _bounded_paths(
+    seeds: Sequence[Any],
+    edges_df: pd.DataFrame,
+    step: Dict[str, Any],
+    dest_allowed: Optional[Set[Any]],
+    caps: OracleCaps,
+) -> BoundedPathResult:
+    """
+    Enumerate bounded-hop paths for a single Edge step (direction already
+    normalized in edges_df). Returns BoundedPathResult with reachable nodes,
+    edges used, and hop labels.
+    """
+    src_col, dst_col, edge_id_col = step["src_col"], step["dst_col"], step["id_col"]
+    min_hops, max_hops = step["min_hops"], step["max_hops"]
+    label_seeds = step.get("label_seeds", False)
+
+    adjacency: Dict[Any, List[Tuple[Any, Any]]] = {}
+    for _, row in edges_df.iterrows():
+        adjacency.setdefault(row[src_col], []).append((row[edge_id_col], row[dst_col]))
+
+    seed_to_nodes: Dict[Any, Set[Any]] = {}
+    edges_used: Set[Any] = set()
+    nodes_used: Set[Any] = set()
+    # Track minimum hop distance for each node/edge
+    node_hops: Dict[Any, int] = {}
+    edge_hops: Dict[Any, int] = {}
+
+    for seed in seeds:
+        # Phase 1: Explore all paths and find valid destinations (reachable within [min_hops, max_hops])
+        # Also collect ALL paths to ALL nodes (will filter in phase 2)
+        all_paths: List[Tuple[Any, List[Any], List[Any]]] = []  # (destination, edge_ids, node_ids)
+        valid_destinations: Set[Any] = set()
+
+        stack: List[Tuple[Any, int, List[Any], List[Any]]] = [(seed, 0, [], [seed])]
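+        # Depth-first walk enumerating every path (cycles included) up to
+        # max_hops, not only shortest paths, so min-hop pruning and output
+        # slicing can see all depths.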
+        while stack:
+            node, depth, path_edges, path_nodes = stack.pop()
+            if depth >= max_hops:
+                continue
+            for edge_id, dst in adjacency.get(node, []):
+                new_depth = depth + 1
+                new_path = path_edges + [edge_id]
+                new_nodes = path_nodes + [dst]
+
+                # Save every path
+                all_paths.append((dst, list(new_path), list(new_nodes)))
+
+                if new_depth >= min_hops:
+                    if dest_allowed is None or dst in dest_allowed:
+                        valid_destinations.add(dst)
+                        seed_to_nodes.setdefault(seed, set()).add(dst)
+
+                if new_depth < max_hops:
+                    stack.append((dst, new_depth, new_path, new_nodes))
+
+        # Phase 2: Include nodes/edges from paths that lead to valid destinations
+        if valid_destinations:
+            # Include seed in output since we have valid paths
+            nodes_used.add(seed)
+            if label_seeds and seed not in node_hops:
+                node_hops[seed] = 0
+
+        for dst, path_edges, path_nodes in all_paths:
+            if dst in valid_destinations:
+                edges_used.update(path_edges)
+                nodes_used.update(path_nodes)
+                # Track hop distances
+                for i, eid in enumerate(path_edges):
+                    hop_dist = i + 1
+                    if eid not in edge_hops or edge_hops[eid] > hop_dist:
+                        edge_hops[eid] = hop_dist
+                for i, nid in enumerate(path_nodes):
+                    hop_dist = i
+                    if hop_dist == 0 and not label_seeds:
+                        continue
+                    if nid not in node_hops or node_hops[nid] > hop_dist:
+                        node_hops[nid] = hop_dist
+
+    if len(edges_used) > caps.max_edges or len(nodes_used) > caps.max_nodes:
+        raise ValueError("Enumerator caps exceeded during bounded hop traversal")
+
+    return BoundedPathResult(
+        seed_to_nodes=seed_to_nodes,
+        edges_used=edges_used,
+        nodes_used=nodes_used,
+        node_hops=node_hops,
+        edge_hops=edge_hops,
+    )
+
+
 def _build_tags(
     paths: pd.DataFrame,
     node_steps: Sequence[Dict[str, Any]],
diff --git a/graphistry/models/gfql/types/call.py b/graphistry/models/gfql/types/call.py
index e490eb3c68..9a38e0989d 100644
--- a/graphistry/models/gfql/types/call.py
+++ b/graphistry/models/gfql/types/call.py
@@ -86,6 +86,13 @@ class HopParams(TypedDict, total=False):
     """Parameters for hop() traversal operation."""
     nodes: Any
     hops: int
+    min_hops: int
+    max_hops: int
+    label_node_hops: Optional[str]
+    label_edge_hops: Optional[str]
+    label_seeds: bool
+    output_min_hops: int
+    output_max_hops: int
     to_fixed_point: bool
     direction: Literal['forward', 'reverse', 'undirected']
     source_node_match: Dict[str, Any]
diff --git a/graphistry/plugins/kusto.py b/graphistry/plugins/kusto.py
index 9c7e0974d9..1293bcf79f 100644
--- a/graphistry/plugins/kusto.py
+++ b/graphistry/plugins/kusto.py
@@ -456,7 +456,6 @@ def _unwrap_nested(result: "KustoQueryResult") -> pd.DataFrame:
         if df[col].isna().all():
             df = df.drop(columns=[col])
 
-    # Replace pandas/NA with Python None for consistency without type ignores
     df = df.astype(object)
     df = df.where(pd.notna(df))
     df = df.fillna(value=None)
diff --git a/graphistry/tests/test_compute_hops.py b/graphistry/tests/test_compute_hops.py
index 4f9590d9f7..4eb323b62c 100644
--- a/graphistry/tests/test_compute_hops.py
+++ b/graphistry/tests/test_compute_hops.py
@@ -1,4 +1,6 @@
 import pandas as pd
+import pytest
+import graphistry
 
 from common import NoAuthTestCase
 from functools import lru_cache
@@ -49,6 +51,42 @@ def hops_graph() -> CGFull:
 
     return CGFull().nodes(nodes_df, 'node').edges(edges_df, 's', 'd')
 
+
+def simple_chain_graph() -> CGFull:
+    nodes_df = pd.DataFrame([
+        {'node': 'a'},
+        {'node': 'b'},
+        {'node': 'c'},
+        {'node': 'd'},
+    ])
+    edges_df = pd.DataFrame([
+        {'s': 'a', 'd': 'b'},
+        {'s': 'b', 'd': 'c'},
+        {'s': 'c', 'd': 'd'},
+    ])
+    return CGFull().nodes(nodes_df, 'node').edges(edges_df, 's', 'd')
+
+
+def branching_chain_graph() -> CGFull:
+    nodes_df = pd.DataFrame([
+        {'node': 'a'},
+        {'node': 'b1'},
+        {'node': 'c1'},
+        {'node': 'd1'},
+        {'node': 'e1'},
+        {'node': 'b2'},
+        {'node': 'c2'},
+    ])
+    edges_df = pd.DataFrame([
+        {'s': 'a', 'd': 'b1'},
+        {'s': 'b1', 'd': 'c1'},
+        {'s': 'c1', 'd': 'd1'},
+        {'s': 'd1', 'd': 'e1'},
+        {'s': 'a', 'd': 'b2'},
+        {'s': 'b2', 'd': 'c2'},
+    ])
+    return CGFull().nodes(nodes_df, 'node').edges(edges_df, 's', 'd')
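+
+# Fixture shapes (sketch): the simple chain is a->b->c->d; the branching graph
+# has a long arm a->b1->c1->d1->e1 and a short arm a->b2->c2 off the same seed.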
+
 
 class TestComputeHopMixin(NoAuthTestCase):
 
@@ -184,6 +222,798 @@ class TestComputeHopMixin(NoAuthTestCase):
     def test_predicate_is_in(self):
         g = hops_graph()
         assert g.hop(source_node_match={'node': is_in(['e', 'k'])})._edges.shape == (3, 3)
 
+    def test_hop_min_max_range(self):
+        g = simple_chain_graph()
+        seeds = pd.DataFrame({g._node: ['a']})
+        g2 = g.hop(seeds, min_hops=2, max_hops=3)
+        assert set(g2._nodes[g2._node].to_list()) == {'a', 'b', 'c', 'd'}
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {('a', 'b'), ('b', 'c'), ('c', 'd')}
+
+    def test_hop_min_not_reached_returns_empty(self):
+        edges = pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']})
+        g = graphistry.edges(edges, 's', 'd').nodes(pd.DataFrame({'id': ['a', 'b', 'c']}), 'id')
+        seeds = pd.DataFrame({g._node: ['a']})
+        g2 = g.hop(seeds, min_hops=4, max_hops=4)
+        assert g2._nodes.empty
+        assert g2._edges.empty
+
+    def test_hop_exact_three_branch(self):
+        """Test that min_hops=max_hops=3 prunes branches that don't reach 3 hops.
+
+        On a branching graph with paths a->b1->c1->d1->e1 (4 hops) and a->b2->c2
+        (2 hops), requesting exactly 3 hops should return only paths that reach
+        3 hops. The b2/c2 branch should be excluded since it only reaches 2 hops.
+        """
+        g = branching_chain_graph()
+        seeds = pd.DataFrame({g._node: ['a']})
+        g2 = g.hop(seeds, min_hops=3, max_hops=3)
+        # Only nodes/edges on paths reaching 3 hops; b2/c2 branch excluded
+        assert set(g2._nodes[g._node].to_list()) == {'a', 'b1', 'c1', 'd1'}
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {
+            ('a', 'b1'),
+            ('b1', 'c1'),
+            ('c1', 'd1'),
+        }
+
+    def test_hop_labels_nodes_edges(self):
+        g = simple_chain_graph()
+        seeds = pd.DataFrame({g._node: ['a']})
+        g2 = g.hop(seeds, min_hops=1, max_hops=3, label_node_hops='hop', label_edge_hops='edge_hop', label_seeds=True)
+        node_hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        assert node_hops == {'a': 0, 'b': 1, 'c': 2, 'd': 3}
+        edge_hops = {(row['s'], row['d'], row['edge_hop']) for _, row in g2._edges.iterrows()}
+        assert edge_hops == {('a', 'b', 1), ('b', 'c', 2), ('c', 'd', 3)}
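+
+    # output_min_hops/output_max_hops slice what is emitted; the traversal
+    # still covers [min_hops, max_hops], so sliced rows keep the hop depths
+    # recorded during the wider walk.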
+
+    def test_hop_slice_labels_seed_zero(self):
+        g = simple_chain_graph()
+        seeds = pd.DataFrame({g._node: ['a']})
+        g2 = g.hop(
+            seeds,
+            min_hops=2,
+            max_hops=4,
+            output_min_hops=3,
+            output_max_hops=4,
+            label_node_hops='hop',
+            label_edge_hops='edge_hop',
+            label_seeds=True,
+            return_as_wave_front=False
+        )
+        node_hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        assert node_hops.get('a') == 0  # seeds kept and labeled when label_seeds=True
+        assert node_hops.get('d') == 3
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {('c', 'd')}
+
+    def test_hop_labels_seed_toggle(self):
+        g = simple_chain_graph()
+        seeds = pd.DataFrame({g._node: ['a']})
+        g_no_seed = g.hop(seeds, min_hops=1, max_hops=2, label_node_hops='hop', label_edge_hops='edge_hop', label_seeds=False)
+        node_hops_no_seed = dict(zip(g_no_seed._nodes[g._node], g_no_seed._nodes['hop']))
+        assert pd.isna(node_hops_no_seed.get('a'))
+        assert {k: v for k, v in node_hops_no_seed.items() if pd.notna(v)} == {'b': 1, 'c': 2}
+        assert str(g_no_seed._nodes['hop'].dtype) == 'Int64'
+        edge_hops_no_seed = {(row['s'], row['d'], row['edge_hop']) for _, row in g_no_seed._edges.iterrows()}
+        assert edge_hops_no_seed == {('a', 'b', 1), ('b', 'c', 2)}
+
+        g_with_seed = g.hop(seeds, min_hops=1, max_hops=2, label_node_hops='hop', label_edge_hops='edge_hop', label_seeds=True)
+        node_hops_with_seed = dict(zip(g_with_seed._nodes[g._node], g_with_seed._nodes['hop']))
+        assert node_hops_with_seed == {'a': 0, 'b': 1, 'c': 2}
+        edge_hops_with_seed = {(row['s'], row['d'], row['edge_hop']) for _, row in g_with_seed._edges.iterrows()}
+        assert edge_hops_with_seed == {('a', 'b', 1), ('b', 'c', 2)}
+
+    def test_hop_output_slice(self):
+        g = simple_chain_graph()
+        seeds = pd.DataFrame({g._node: ['a']})
+        g2 = g.hop(seeds, min_hops=2, max_hops=2, label_node_hops='hop', label_edge_hops='edge_hop')
+        assert set(g2._nodes[g._node].to_list()) == {'a', 'b', 'c'}
+        assert set(g2._nodes['hop'].dropna().to_list()) == {1, 2}
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {('a', 'b'), ('b', 'c')}
+        assert set(g2._edges['edge_hop'].to_list()) == {1, 2}
+
+    def test_hop_output_slice_below_min_keeps_path(self):
+        g = simple_chain_graph()
+        seeds = pd.DataFrame({g._node: ['a']})
+        g2 = g.hop(
+            seeds,
+            min_hops=3,
+            max_hops=3,
+            output_min_hops=1,
+            label_node_hops='hop',
+            label_edge_hops='edge_hop',
+            label_seeds=True
+        )
+        node_hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        assert node_hops == {'a': 0, 'b': 1, 'c': 2, 'd': 3}
+        edge_hops = {(row['s'], row['d'], row['edge_hop']) for _, row in g2._edges.iterrows()}
+        assert edge_hops == {('a', 'b', 1), ('b', 'c', 2), ('c', 'd', 3)}
+
+    def test_hop_output_slice_range(self):
+        g = branching_chain_graph()
+        seeds = pd.DataFrame({g._node: ['a']})
+        g2 = g.hop(
+            seeds,
+            min_hops=2,
+            max_hops=4,
+            output_min_hops=3,
+            output_max_hops=4,
+            label_node_hops='hop',
+            label_edge_hops='edge_hop'
+        )
+        assert set(g2._nodes[g._node].to_list()) == {'c1', 'd1', 'e1'}
+        assert set(g2._nodes.dropna(subset=['hop'])['hop'].to_list()) == {2, 3, 4}
+        assert set(zip(g2._edges['s'], g2._edges['d'], g2._edges['edge_hop'])) == {
+            ('c1', 'd1', 3),
+            ('d1', 'e1', 4)
+        }
+
+    def test_hop_output_slice_min_above_max_raises(self):
+        g = simple_chain_graph()
+        seeds = pd.DataFrame({g._node: ['a']})
+        with pytest.raises(ValueError, match='output_min_hops .* cannot exceed max_hops'):
+            g.hop(seeds, min_hops=2, max_hops=3, output_min_hops=4)
+
+    def test_hop_output_slice_max_below_min_raises(self):
+        g = simple_chain_graph()
+        seeds = pd.DataFrame({g._node: ['a']})
+        with pytest.raises(ValueError, match='output_max_hops .* cannot be below min_hops'):
+            g.hop(seeds, min_hops=2, max_hops=3, output_max_hops=1)
+
+    def test_hop_output_slice_max_above_traversal_allowed(self):
+        g = simple_chain_graph()
+        seeds = pd.DataFrame({g._node: ['a']})
+        g2 = g.hop(seeds, min_hops=2, max_hops=2, output_max_hops=5, label_edge_hops='edge_hop')
+        # Output cap respects traversal; no extra hops are produced
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {('a', 'b'), ('b', 'c')}
+        assert set(g2._edges['edge_hop']) == {1, 2}
+
+    def test_hop_output_slice_without_labels(self):
+        g = branching_chain_graph()
+        seeds = pd.DataFrame({g._node: ['a']})
+        g2 = g.hop(
+            seeds,
+            min_hops=2,
+            max_hops=3,
+            output_min_hops=3,
+            output_max_hops=3
+        )
+        # Output slice applies even without explicit labels; label columns are dropped
+        assert set(g2._nodes[g._node].to_list()) == {'c1', 'd1'}
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {('c1', 'd1')}
+        assert 'hop' not in g2._nodes.columns
+        assert 'edge_hop' not in g2._edges.columns
+
+    def test_hop_cycle_min_gt_one(self):
+        # Cycle a->b->c->a; ensure min>1 does not loop infinitely and labels stick to earliest hop
+        edges = pd.DataFrame({'s': ['a', 'b', 'c'], 'd': ['b', 'c', 'a']})
+        g = graphistry.edges(edges, 's', 'd').nodes(pd.DataFrame({'id': ['a', 'b', 'c']}), 'id')
+        seeds = pd.DataFrame({g._node: ['a']})
+        g2 = g.hop(seeds, min_hops=2, max_hops=3, label_node_hops='hop', label_edge_hops='edge_hop')
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {('a', 'b'), ('b', 'c'), ('c', 'a')}
+        node_hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        assert node_hops['a'] == 3  # first return to seed at hop 3
+        assert node_hops['b'] == 1 and node_hops['c'] == 2
+        assert set(g2._edges['edge_hop']) == {1, 2, 3}
+
+    def test_hop_undirected_min_gt_one(self):
+        edges = pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']})
+        g = graphistry.edges(edges, 's', 'd').nodes(pd.DataFrame({'id': ['a', 'b', 'c']}), 'id')
+        seeds = pd.DataFrame({g._node: ['a']})
+        g2 = g.hop(seeds, direction='undirected', min_hops=2, max_hops=3, label_node_hops='hop', label_edge_hops='edge_hop')
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {('a', 'b'), ('b', 'c')}
+        assert set(g2._edges['edge_hop']) == {1, 2}
+        node_hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        assert node_hops.get('b') == 1 and node_hops.get('c') == 2
+
+    def test_hop_label_collision_suffix(self):
+        # Existing hop column should be preserved; new label suffixes
+        g = simple_chain_graph()
+        seeds = pd.DataFrame({g._node: ['a']})
+        g_existing = g.nodes(g._nodes.assign(hop='keep_me'))
+        g2 = g_existing.hop(seeds, min_hops=1, max_hops=2, label_node_hops='hop', label_edge_hops='hop')
+        assert 'hop' in g2._nodes.columns and 'hop_1' in g2._nodes.columns
+        assert set(g2._edges.columns) & {'hop', 'hop_1'} == {'hop'}  # edges only suffix once
+        assert 'keep_me' in set(g2._nodes['hop'])
+
+    def test_hop_seed_labels(self):
+        g = simple_chain_graph()
+        seeds = pd.DataFrame({g._node: ['a']})
+        g2 = g.hop(seeds, min_hops=1, max_hops=3, label_node_hops='hop', label_seeds=True)
+        node_hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        assert node_hops['a'] == 0 and node_hops['b'] == 1 and node_hops['c'] == 2 and node_hops['d'] == 3
+
+    def test_hop_call_path_new_params(self):
+        g = simple_chain_graph()
+        seeds = pd.DataFrame({g._node: ['a']})
+        payload = {'type': 'Call', 'function': 'hop', 'params': {
+            'nodes': seeds,
+            'min_hops': 1,
+            'max_hops': 2,
+            'label_node_hops': 'hop',
+            'label_edge_hops': 'edge_hop'
+        }}
+        g2 = g.gfql([payload])
+        assert set(g2._nodes['hop'].dropna()) == {1, 2}
+        assert set(g2._edges['edge_hop']) == {1, 2}
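+
+    # The same bounds, labels, and slicing are exposed through GFQL Edge
+    # matchers; the dict chains below mirror e_forward(min_hops=...,
+    # label_node_hops=..., output_min_hops=...).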
+
+    def test_gfql_edge_forward_min_max_labels(self):
+        g = simple_chain_graph()
+        seeds = ['a']
+        chain = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'forward',
+                'min_hops': 1,
+                'max_hops': 2,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+                'label_seeds': True,
+            },
+        ]
+        g2 = g.gfql(chain)
+        node_hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        assert node_hops == {'a': 0, 'b': 1, 'c': 2}
+        edge_hops = {(row['s'], row['d'], row['edge_hop']) for _, row in g2._edges.iterrows()}
+        assert edge_hops == {('a', 'b', 1), ('b', 'c', 2)}
+
+    def test_gfql_edge_forward_output_slice(self):
+        g = simple_chain_graph()
+        seeds = ['a']
+        chain = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'forward',
+                'min_hops': 1,
+                'max_hops': 3,
+                'output_min_hops': 2,
+                'label_node_hops': 'hop',
+            },
+        ]
+        g2 = g.gfql(chain)
+        hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        labeled = {k: v for k, v in hops.items() if pd.notna(v)}
+        assert labeled == {'c': 2, 'd': 3}
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {('b', 'c'), ('c', 'd')}
+
+    def test_gfql_edge_forward_edge_query(self):
+        e_df = pd.DataFrame({
+            's': ['a', 'a', 'd', 'd', 'f', 'f'],
+            'd': ['b', 'b', 'e', 'e', 'g', 'g'],
+            't': ['x', 'h', 'x', 'h', 'x', 'h']
+        })
+        n_df = pd.DataFrame({
+            'n': ['a', 'b', 'd', 'e', 'f', 'g'],
+            't': ['x', 'm', 'x', 'n', 'x', 'o']
+        })
+        g = CGFull().edges(e_df, 's', 'd').nodes(n_df, 'n')
+        chain = [
+            {'type': 'Node', 'filter_dict': {'n': is_in(['a'])}},
+            {
+                'type': 'Edge',
+                'direction': 'forward',
+                'min_hops': 1,
+                'max_hops': 1,
+                'edge_query': 't == "h"',
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+                'label_seeds': True,
+            },
+        ]
+        g2 = g.gfql(chain)
+        node_hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        assert node_hops == {'a': 0, 'b': 1}
+        assert set(zip(g2._edges['s'], g2._edges['d'], g2._edges['edge_hop'])) == {('a', 'b', 1)}
+
+    def test_gfql_edge_reverse_min_max_labels(self):
+        g = simple_chain_graph()
+        seeds = ['c']
+        chain = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'reverse',
+                'min_hops': 1,
+                'max_hops': 2,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+                'label_seeds': True,
+            },
+        ]
+        g2 = g.gfql(chain)
+        node_hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        assert node_hops == {'a': 2, 'b': 1, 'c': 0}
+        assert set(g2._edges['edge_hop']) == {1, 2}
+
+    def test_gfql_edge_reverse_output_slice(self):
+        g = simple_chain_graph()
+        seeds = ['c']
+        chain = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'reverse',
+                'min_hops': 1,
+                'max_hops': 3,
+                'output_min_hops': 2,
+                'output_max_hops': 3,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+            },
+        ]
+        g2 = g.gfql(chain)
+        hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        labeled = {k: v for k, v in hops.items() if pd.notna(v)}
+        assert labeled == {'a': 2}
+        assert set(zip(g2._edges['s'], g2._edges['d'], g2._edges['edge_hop'])) == {('a', 'b', 2)}
+        # endpoints of kept edges are present
+        assert set(g2._edges[g._source]).union(set(g2._edges[g._destination])) <= set(g2._nodes[g._node])
+
+    def test_gfql_edge_reverse_edge_query(self):
+        e_df = pd.DataFrame({
+            's': ['a', 'a', 'd', 'd', 'f', 'f'],
+            'd': ['b', 'b', 'e', 'e', 'g', 'g'],
+            't': ['x', 'h', 'x', 'h', 'x', 'h']
+        })
+        n_df = pd.DataFrame({
+            'n': ['a', 'b', 'd', 'e', 'f', 'g'],
+            't': ['x', 'm', 'x', 'n', 'x', 'o']
+        })
+        g = CGFull().edges(e_df, 's', 'd').nodes(n_df, 'n')
+        chain = [
+            {'type': 'Node', 'filter_dict': {'n': is_in(['e'])}},
+            {
+                'type': 'Edge',
+                'direction': 'reverse',
+                'min_hops': 1,
+                'max_hops': 1,
+                'edge_query': 't == "h"',
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+                'label_seeds': True,
+            },
+        ]
+        g2 = g.gfql(chain)
+        node_hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        assert node_hops == {'d': 1, 'e': 0}
+        assert set(zip(g2._edges['s'], g2._edges['d'], g2._edges['edge_hop'])) == {('d', 'e', 1)}
+
+    def test_gfql_edge_undirected_labels(self):
+        g = simple_chain_graph()
+        seeds = ['a']
+        chain = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'undirected',
+                'min_hops': 1,
+                'max_hops': 2,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+                'label_seeds': True,
+            },
+        ]
+        g2 = g.gfql(chain)
+        node_hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        assert node_hops == {'a': 0, 'b': 1, 'c': 2}
+        assert set(g2._edges['edge_hop']) == {1, 2}
+
+    def test_gfql_edge_undirected_output_slice(self):
+        g = simple_chain_graph()
+        seeds = ['a']
+        chain = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'undirected',
+                'min_hops': 1,
+                'max_hops': 3,
+                'output_min_hops': 2,
+                'output_max_hops': 3,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+            },
+        ]
+        g2 = g.gfql(chain)
+        hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        labeled = {k: v for k, v in hops.items() if pd.notna(v)}
+        assert labeled == {'c': 2, 'd': 3}
+        assert set(zip(g2._edges[g._source], g2._edges[g._destination], g2._edges['edge_hop'])) == {('b', 'c', 2), ('c', 'd', 3)}
+        assert set(g2._edges[g._source]).union(set(g2._edges[g._destination])) <= set(g2._nodes[g._node])
+
+    def test_gfql_chain_forward_reverse_slice_monotonic(self):
+        g = simple_chain_graph()
+        seeds = ['a']
+        chain = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'forward',
+                'min_hops': 1,
+                'max_hops': 3,
+                'output_min_hops': 2,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+            },
+            {'type': 'Edge', 'direction': 'reverse', 'min_hops': 1, 'max_hops': 1},
+        ]
+        g2 = g.gfql(chain)
+        # Should keep only the sliced hops and endpoints after reverse pass
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {('b', 'c'), ('c', 'd')}
+        assert set(g2._edges['s']).union(set(g2._edges['d'])) <= set(g2._nodes[g._node])
+
+    def test_gfql_chain_forward_undirected_slice_monotonic(self):
+        g = simple_chain_graph()
+        seeds = ['a']
+        chain = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'forward',
+                'min_hops': 1,
+                'max_hops': 3,
+                'output_min_hops': 2,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+            },
+            {'type': 'Edge', 'direction': 'undirected', 'min_hops': 1, 'max_hops': 1},
+        ]
+        g2 = g.gfql(chain)
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {('b', 'c'), ('c', 'd')}
+        assert set(g2._edges['s']).union(set(g2._edges['d'])) <= set(g2._nodes[g._node])
+
+    def test_gfql_edge_output_slice_without_labels(self):
+        g = branching_chain_graph()
+        seeds = ['a']
+        chain = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'forward',
+                'min_hops': 2,
+                'max_hops': 3,
+                'output_min_hops': 3,
+                'output_max_hops': 3
+            },
+        ]
+        g2 = g.gfql(chain)
+        assert set(g2._nodes[g._node]) == {'c1', 'd1'}
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {('c1', 'd1')}
+
+    def test_gfql_edge_output_slice_with_filters(self):
+        e_df = pd.DataFrame({
+            's': ['a', 'a', 'a'],
+            'd': ['b', 'c', 'd'],
+            't': [1, 2, 3]
+        })
+        n_df = pd.DataFrame({'id': ['a', 'b', 'c', 'd']})
+        g = CGFull().edges(e_df, 's', 'd').nodes(n_df, 'id')
+        chain = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(['a'])}},
+            {
+                'type': 'Edge',
+                'direction': 'forward',
+                'min_hops': 1,
+                'max_hops': 2,
+                'output_min_hops': 2,
+                'edge_query': 't >= 2',
+                'label_edge_hops': 'edge_hop',
+                'label_node_hops': 'hop',
+            },
+        ]
+        g2 = g.gfql(chain)
+        # Filters remove hop-1 edges; slice leaves nothing reachable at hop 2
+        hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        assert {k: v for k, v in hops.items() if pd.notna(v)} == {}
+        assert g2._edges.empty
+
+    def test_gfql_edge_slice_without_labels_auto(self):
+        g = branching_chain_graph()
+        seeds = ['a']
+        chain = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'forward',
+                'min_hops': 2,
+                'max_hops': 4,
+                'output_min_hops': 3,
+                'output_max_hops': 4,
+            },
+        ]
+        g2 = g.gfql(chain)
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {('c1', 'd1'), ('d1', 'e1')}
+        assert set(g2._edges['s']).union(set(g2._edges['d'])) <= set(g2._nodes[g._node])
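+
+    # label_seeds toggles whether seeds carry hop 0 in the label column; when
+    # off, seed rows stay NA so slices cannot confuse seeds with hop-0 hits.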
+
+    def test_gfql_edge_slice_with_seed_labels_off(self):
+        g = simple_chain_graph()
+        seeds = ['a']
+        chain = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'forward',
+                'min_hops': 1,
+                'max_hops': 3,
+                'output_min_hops': 2,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+                'label_seeds': False,
+            },
+        ]
+        g2 = g.gfql(chain)
+        hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        assert pd.isna(hops.get('a'))
+        assert {k: v for k, v in hops.items() if pd.notna(v)} == {'c': 2, 'd': 3}
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {('b', 'c'), ('c', 'd')}
+
+    def test_gfql_edge_slice_with_seed_labels_on(self):
+        g = simple_chain_graph()
+        seeds = ['a']
+        chain = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'forward',
+                'min_hops': 1,
+                'max_hops': 3,
+                'output_min_hops': 2,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+                'label_seeds': True,
+            },
+        ]
+        g2 = g.gfql(chain)
+        hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        assert {k: v for k, v in hops.items() if pd.notna(v)} == {'a': 0, 'c': 2, 'd': 3}
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {('b', 'c'), ('c', 'd')}
+
+    def test_gfql_edge_exact_branching(self):
+        """Test that min_hops=max_hops=3 prunes branches that don't reach 3 hops.
+
+        On a branching graph with paths a->b1->c1->d1->e1 (4 hops) and a->b2->c2
+        (2 hops), requesting exactly 3 hops should return only the edges/nodes
+        on paths that reach 3 hops. The b2/c2 branch (only 2 hops from a) should
+        be excluded.
+        """
+        g = branching_chain_graph()
+        seeds = ['a']
+        chain = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'forward',
+                'min_hops': 3,
+                'max_hops': 3,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+            },
+        ]
+        g2 = g.gfql(chain)
+        # Only edges on the 3-hop path should be included; b2/c2 branch excluded
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {
+            ('a', 'b1'),
+            ('b1', 'c1'),
+            ('c1', 'd1'),
+        }
+        # Only nodes on the 3-hop path should be included
+        assert set(g2._nodes[g._node]) == {'a', 'b1', 'c1', 'd1'}
+        hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        assert hops.get('d1') == 3
+
+    def test_gfql_edge_fixed_point_with_slice(self):
+        g = simple_chain_graph()
+        seeds = ['a']
+        chain = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'forward',
+                'to_fixed_point': True,
+                'output_min_hops': 2,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+                'label_seeds': True,
+            },
+        ]
+        g2 = g.gfql(chain)
+        hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        assert {k: v for k, v in hops.items() if pd.notna(v)} == {'a': 0, 'c': 2, 'd': 3}
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {('b', 'c'), ('c', 'd')}
+
+    def test_gfql_edge_reverse_slice_with_filters(self):
+        e_df = pd.DataFrame({
+            's': ['a', 'b'],
+            'd': ['b', 'c'],
+            't': [2, 2]
+        })
+        n_df = pd.DataFrame({'id': ['a', 'b', 'c']})
+        g = CGFull().edges(e_df, 's', 'd').nodes(n_df, 'id')
+        chain = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(['c'])}},
+            {
+                'type': 'Edge',
+                'direction': 'reverse',
+                'min_hops': 1,
+                'max_hops': 2,
+                'output_min_hops': 2,
+                'edge_query': 't == 2',
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+            },
+        ]
+        g2 = g.gfql(chain)
+        hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        assert {k: v for k, v in hops.items() if pd.notna(v)} == {'a': 2}
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {('a', 'b')}
+
+    def test_gfql_edge_undirected_slice_seed_labels_toggle(self):
+        g = simple_chain_graph()
+        seeds = ['a']
+        chain_off = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'undirected',
+                'min_hops': 1,
+                'max_hops': 3,
+                'output_min_hops': 2,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+                'label_seeds': False,
+            },
+        ]
+        g_off = g.gfql(chain_off)
+        hops_off = dict(zip(g_off._nodes[g._node], g_off._nodes['hop']))
+        assert pd.isna(hops_off.get('a'))
+
+        chain_on = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'undirected',
+                'min_hops': 1,
+                'max_hops': 3,
+                'output_min_hops': 2,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+                'label_seeds': True,
+            },
+        ]
+        g_on = g.gfql(chain_on)
+        hops_on = dict(zip(g_on._nodes[g._node], g_on._nodes['hop']))
+        assert hops_on.get('a') == 0
+
+    def test_gfql_edge_slice_multiseed_labels_toggle(self):
+        e_df = pd.DataFrame({
+            's': ['a', 'b', 'x', 'y'],
+            'd': ['b', 'c', 'y', 'z'],
+        })
+        n_df = pd.DataFrame({'id': ['a', 'b', 'c', 'x', 'y', 'z']})
+        g = CGFull().edges(e_df, 's', 'd').nodes(n_df, 'id')
+        seeds = ['a', 'x']
+
+        chain_off = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'forward',
+                'min_hops': 1,
+                'max_hops': 2,
+                'output_min_hops': 2,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+                'label_seeds': False,
+            },
+        ]
+        g_off = g.gfql(chain_off)
+        hops_off = dict(zip(g_off._nodes[g._node], g_off._nodes['hop']))
+        assert pd.isna(hops_off.get('a')) and pd.isna(hops_off.get('x'))
+        assert {k: v for k, v in hops_off.items() if pd.notna(v)} == {'c': 2, 'z': 2}
+        assert set(zip(g_off._edges['s'], g_off._edges['d'])) == {('b', 'c'), ('y', 'z')}
+
+        chain_on = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'forward',
+                'min_hops': 1,
+                'max_hops': 2,
+                'output_min_hops': 2,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+                'label_seeds': True,
+            },
+        ]
+        g_on = g.gfql(chain_on)
+        hops_on = dict(zip(g_on._nodes[g._node], g_on._nodes['hop']))
+        assert hops_on.get('a') == 0 and hops_on.get('x') == 0
+        assert {k: v for k, v in hops_on.items() if pd.notna(v)} == {'a': 0, 'x': 0, 'c': 2, 'z': 2}
+        assert set(zip(g_on._edges['s'], g_on._edges['d'])) == {('b', 'c'), ('y', 'z')}
+
+    def test_gfql_edge_reverse_slice_seed_labels_toggle(self):
+        g = simple_chain_graph()
+        seeds = ['d']
+        chain_off = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'reverse',
+                'min_hops': 1,
+                'max_hops': 3,
+                'output_min_hops': 2,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+                'label_seeds': False,
+            },
+        ]
+        g_off = g.gfql(chain_off)
+        hops_off = dict(zip(g_off._nodes[g._node], g_off._nodes['hop']))
+        assert pd.isna(hops_off.get('d'))
+        assert {k: v for k, v in hops_off.items() if pd.notna(v)} == {'b': 2, 'a': 3}
+        assert set(zip(g_off._edges['s'], g_off._edges['d'])) == {('a', 'b'), ('b', 'c')}
+
+        chain_on = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'reverse',
+                'min_hops': 1,
+                'max_hops': 3,
+                'output_min_hops': 2,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+                'label_seeds': True,
+            },
+        ]
+        g_on = g.gfql(chain_on)
+        hops_on = dict(zip(g_on._nodes[g._node], g_on._nodes['hop']))
+        assert hops_on.get('d') == 0
+        assert {k: v for k, v in hops_on.items() if pd.notna(v)} == {'d': 0, 'b': 2, 'a': 3}
+        assert set(zip(g_on._edges['s'], g_on._edges['d'])) == {('a', 'b'), ('b', 'c')}
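+
+    # to_fixed_point traversals honor the same output windows: traverse to
+    # closure, then emit only hops inside the requested slice.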
+
+    def test_gfql_edge_fixed_point_output_max(self):
+        g = simple_chain_graph()
+        seeds = ['a']
+        chain = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'forward',
+                'to_fixed_point': True,
+                'output_max_hops': 2,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+                'label_seeds': True,
+            },
+        ]
+        g2 = g.gfql(chain)
+        hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        assert {k: v for k, v in hops.items() if pd.notna(v)} == {'a': 0, 'b': 1, 'c': 2}
+        assert set(zip(g2._edges['s'], g2._edges['d'], g2._edges['edge_hop'])) == {('a', 'b', 1), ('b', 'c', 2)}
+
+    def test_gfql_chain_output_slice_monotonic_paths(self):
+        g = simple_chain_graph()
+        seeds = ['a']
+        chain = [
+            {'type': 'Node', 'filter_dict': {g._node: is_in(seeds)}},
+            {
+                'type': 'Edge',
+                'direction': 'forward',
+                'min_hops': 1,
+                'max_hops': 3,
+                'output_min_hops': 2,
+                'label_node_hops': 'hop',
+                'label_edge_hops': 'edge_hop',
+            },
+            {'type': 'Node'},  # keep reachable slice after traversal/slicing
+        ]
+        g2 = g.gfql(chain)
+        hops = dict(zip(g2._nodes[g._node], g2._nodes['hop']))
+        labeled = {k: v for k, v in hops.items() if pd.notna(v)}
+        assert labeled == {'c': 2, 'd': 3}
+        # Edges should only be from the sliced hops; earlier hop edges excluded
+        assert set(zip(g2._edges['s'], g2._edges['d'])) == {('b', 'c'), ('c', 'd')}
+        # Edge endpoints remain present in nodes
+        assert set(g2._edges['s']).union(set(g2._edges['d'])) <= set(g2._nodes[g._node])
+
 
 class TestComputeHopMixinQuery(NoAuthTestCase):
 
     def test_hop_source_query(self):
diff --git a/mypy.ini b/mypy.ini
index 6617b6864f..d3c38b0b90 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -1,5 +1,5 @@
 [mypy]
-
+# python_version intentionally unset: rely on interpreter version (e.g., 3.12 runners)
 # TODO check tests
 exclude = graph_vector_pb2|versioneer|_version|graphistry/tests
@@ -117,3 +117,9 @@ ignore_missing_imports = True
 
 [mypy-requests.*]
 ignore_missing_imports = True
+
+[mypy-urllib3.*]
+ignore_missing_imports = True
+
+[mypy-packaging.*]
+ignore_missing_imports = True
diff --git a/tests/gfql/ref/test_enumerator_parity.py b/tests/gfql/ref/test_enumerator_parity.py
index e5bee253a5..59d76ee75b 100644
--- a/tests/gfql/ref/test_enumerator_parity.py
+++ b/tests/gfql/ref/test_enumerator_parity.py
@@ -19,7 +19,7 @@ def _alias_bindings(df, id_col, alias):
     return set(df.loc[df[alias].astype(bool), id_col])
 
 
-def _run_parity_case(nodes, edges, ops):
+def _run_parity_case(nodes, edges, ops, check_hop_labels=False):
     g = (
         CGFull()
         .nodes(pd.DataFrame(nodes), "id")
@@ -48,6 +48,44 @@ def _run_parity_case(nodes, edges, ops):
         elif isinstance(op, ASTEdge):
             assert oracle.tags.get(alias, set()) == _alias_bindings(gfql_edges, g._edge, alias)
 
+    # Check hop labels if requested
+    if check_hop_labels:
+        for op in ops:
+            if not isinstance(op, ASTEdge):
+                continue
+            node_hop_col = getattr(op, 'label_node_hops', None)
+            edge_hop_col = getattr(op, 'label_edge_hops', None)
+            label_seeds = getattr(op, 'label_seeds', False)
+
+            if node_hop_col and gfql_nodes is not None and node_hop_col in gfql_nodes.columns:
+                # Compare node hop labels
+                gfql_node_hops = {
+                    row[g._node]: int(row[node_hop_col])
+                    for _, row in gfql_nodes.iterrows()
+                    if pd.notna(row[node_hop_col])
+                }
+                oracle_node_hops = oracle.node_hop_labels or {}
+                # Oracle should match GFQL for non-seed nodes
+                for nid, hop in gfql_node_hops.items():
+                    if hop == 0 and not label_seeds:
+                        continue  # Skip seeds when label_seeds=False
+                    assert nid in oracle_node_hops, f"Node {nid} with hop {hop} not in oracle"
+                    assert oracle_node_hops[nid] == hop, f"Node {nid}: oracle hop {oracle_node_hops[nid]} != gfql hop {hop}"
+
+            if edge_hop_col and gfql_edges is not None and edge_hop_col in gfql_edges.columns:
+                # Compare edge hop labels
+                gfql_edge_hops = {
+                    row[g._edge]: int(row[edge_hop_col])
+                    for _, row in gfql_edges.iterrows()
+                    if pd.notna(row[edge_hop_col])
+                }
+                oracle_edge_hops = oracle.edge_hop_labels or {}
+                for eid, hop in gfql_edge_hops.items():
+                    assert eid in oracle_edge_hops, f"Edge {eid} with hop {hop} not in oracle"
+                    assert oracle_edge_hops[eid] == hop, f"Edge {eid}: oracle hop {oracle_edge_hops[eid]} != gfql hop {hop}"
+
+    return oracle  # Return for additional assertions in specific tests
+
 
 CASES = [
     (
@@ -153,9 +191,359 @@
         ],
         [n({"type": "account"}, name="root"), e_forward({"type": "txn"}, name="first_hop"), n({"type": "account"}, name="child")],
     ),
+    (
+        "forward_labels",
+        [
+            {"id": "acct1", "type": "account"},
+            {"id": "acct2", "type": "account"},
+            {"id": "acct3", "type": "account"},
+        ],
+        [
+            {"edge_id": "e1", "src": "acct1", "dst": "acct2", "type": "txn"},
+            {"edge_id": "e2", "src": "acct2", "dst": "acct3", "type": "txn"},
+        ],
+        [
+            n({"type": "account"}, name="start"),
+            e_forward(
+                {"type": "txn"},
+                name="hop",
+                label_node_hops="node_hop",
+                label_edge_hops="edge_hop",
+                label_seeds=True,
+            ),
+            n({"type": "account"}, name="end"),
+        ],
+    ),
+    (
+        "reverse_two_hop",
+        [
+            {"id": "acct1", "type": "account"},
+            {"id": "acct2", "type": "account"},
+            {"id": "user1", "type": "user"},
+        ],
+        [
+            {"edge_id": "txn1", "src": "acct1", "dst": "acct2", "type": "txn"},
+            {"edge_id": "owns1", "src": "acct2", "dst": "user1", "type": "owns"},
+        ],
+        [
+            n({"type": "user"}, name="user_end"),
+            e_reverse({"type": "owns"}, name="owns_rev"),
+            n({"type": "account"}, name="acct_mid"),
+            e_reverse({"type": "txn"}, name="txn_rev"),
+            n({"type": "account"}, name="acct_start"),
+        ],
+    ),
 ]
 
 
 @pytest.mark.parametrize("_, nodes, edges, ops", CASES, ids=[case[0] for case in CASES])
 def test_enumerator_matches_gfql(_, nodes, edges, ops):
     _run_parity_case(nodes, edges, ops)
+
+
+def test_enumerator_min_max_three_branch_unlabeled():
+    nodes = [
+        {"id": "a"},
+        {"id": "b1"},
+        {"id": "c1"},
+        {"id": "d1"},
+        {"id": "e1"},
+        {"id": "b2"},
+        {"id": "c2"},
+    ]
+    edges = [
+        {"edge_id": "e1", "src": "a", "dst": "b1"},
+        {"edge_id": "e2", "src": "b1", "dst": "c1"},
+        {"edge_id": "e3", "src": "c1", "dst": "d1"},
+        {"edge_id": "e4", "src": "d1", "dst": "e1"},
+        {"edge_id": "e5", "src": "a", "dst": "b2"},
+        {"edge_id": "e6", "src": "b2", "dst": "c2"},
+    ]
+    ops = [
+        n({"id": "a"}),
+        e_forward(min_hops=3, max_hops=3),
+        n(),
+    ]
+    _run_parity_case(nodes, edges, ops)
+
+
+# ============================================================================
+# TRICKY PARITY TESTS - Exercise edge cases for hop bounds/labels
+# ============================================================================
+
+
+class TestTrickyHopBounds:
+    """Test cases designed to catch subtle bugs in hop bounds and label logic."""
+
+    def test_dead_end_branch_pruning(self):
+        """min_hops should prune branches that don't reach the minimum.
+
+        Graph:
+            a -> b -> c -> d (3 edges, reaches hop 3)
+            a -> x (1 edge, dead end at hop 1)
+
+        With min_hops=2, the a->x branch should be pruned.
+        """
+        nodes = [
+            {"id": "a"},
+            {"id": "b"},
+            {"id": "c"},
+            {"id": "d"},
+            {"id": "x"},
+        ]
+        edges = [
+            {"edge_id": "e1", "src": "a", "dst": "b"},
+            {"edge_id": "e2", "src": "b", "dst": "c"},
+            {"edge_id": "e3", "src": "c", "dst": "d"},
+            {"edge_id": "dead", "src": "a", "dst": "x"},
+        ]
+        ops = [
+            n({"id": "a"}),
+            e_forward(min_hops=2, max_hops=3, label_node_hops="hop", label_edge_hops="ehop"),
+            n(),
+        ]
+        oracle = _run_parity_case(nodes, edges, ops, check_hop_labels=True)
+        # x and dead edge should not be in output
+        assert "x" not in set(oracle.nodes["id"])
+        assert "dead" not in set(oracle.edges["edge_id"])
+
+    def test_output_slice_vs_traversal_bounds(self):
+        """output_min/max should filter output without affecting traversal.
+
+        Graph: a -> b -> c -> d -> e (linear, 4 edges)
+
+        With min_hops=1, max_hops=4, output_min_hops=2, output_max_hops=3:
+        - Traversal reaches all nodes
+        - Output includes edges at hop 2-3 (e2, e3)
+        - Output includes nodes that are endpoints of those edges (b, c, d)
+        - Node hop labels only set for nodes within slice (c=2, d=3), others NA
+        """
+        nodes = [{"id": x} for x in ["a", "b", "c", "d", "e"]]
+        edges = [
+            {"edge_id": "e1", "src": "a", "dst": "b"},
+            {"edge_id": "e2", "src": "b", "dst": "c"},
+            {"edge_id": "e3", "src": "c", "dst": "d"},
+            {"edge_id": "e4", "src": "d", "dst": "e"},
+        ]
+        ops = [
+            n({"id": "a"}),
+            e_forward(
+                min_hops=1,
+                max_hops=4,
+                output_min_hops=2,
+                output_max_hops=3,
+                label_node_hops="hop",
+                label_edge_hops="ehop",
+            ),
+            n(),
+        ]
+        oracle = _run_parity_case(nodes, edges, ops, check_hop_labels=True)
+
+        # Edges at hop 2-3 should be in output
+        output_edges = set(oracle.edges["edge_id"])
+        assert "e1" not in output_edges  # hop 1
+        assert "e2" in output_edges  # hop 2
+        assert "e3" in output_edges  # hop 3
+        assert "e4" not in output_edges  # hop 4
+
+        # Nodes that are endpoints of kept edges
+        output_nodes = set(oracle.nodes["id"])
+        assert "a" not in output_nodes  # not an endpoint of e2 or e3
+        assert "b" in output_nodes  # source of e2
+        assert "c" in output_nodes  # dst of e2, source of e3
+        assert "d" in output_nodes  # dst of e3
+        assert "e" not in output_nodes  # not an endpoint of kept edges
+
+        # Only nodes at hop 2-3 have hop labels
+        assert oracle.node_hop_labels is not None
+        assert oracle.node_hop_labels.get("c") == 2
+        assert oracle.node_hop_labels.get("d") == 3
+        assert "b" not in oracle.node_hop_labels  # hop 1, outside slice
+
+    def test_label_seeds_true(self):
+        """label_seeds=True should label seed nodes with hop=0."""
+        nodes = [{"id": x} for x in ["seed", "b", "c"]]
+        edges = [
+            {"edge_id": "e1", "src": "seed", "dst": "b"},
+            {"edge_id": "e2", "src": "b", "dst": "c"},
+        ]
+        ops = [
+            n({"id": "seed"}),
+            e_forward(
+                min_hops=1,
+                max_hops=2,
+                label_node_hops="hop",
+                label_seeds=True,
+            ),
+            n(),
+        ]
+        oracle = _run_parity_case(nodes, edges, ops, check_hop_labels=True)
+        # Seed should have hop=0
+        assert oracle.node_hop_labels is not None
+        assert oracle.node_hop_labels.get("seed") == 0
+        assert oracle.node_hop_labels.get("b") == 1
+        assert oracle.node_hop_labels.get("c") == 2
+
+    def test_label_seeds_false(self):
+        """label_seeds=False should not label seed nodes (hop=NA)."""
+        nodes = [{"id": x} for x in ["seed", "b", "c"]]
+        edges = [
+            {"edge_id": "e1", "src": "seed", "dst": "b"},
+            {"edge_id": "e2", "src": "b", "dst": "c"},
+        ]
+        ops = [
+            n({"id": "seed"}),
+            e_forward(
+                min_hops=1,
+                max_hops=2,
+                label_node_hops="hop",
+                label_seeds=False,
+            ),
+            n(),
+        ]
+        oracle = _run_parity_case(nodes, edges, ops, check_hop_labels=True)
+        # Seed should NOT have hop label (or not be 0)
+        assert oracle.node_hop_labels is not None
+        assert "seed" not in oracle.node_hop_labels or oracle.node_hop_labels.get("seed") != 0
+
+    def test_cycle_with_bounds(self):
+        """Cycles should handle hop bounds correctly.
+
+        Graph: a -> b -> c -> a (triangle cycle)
+
+        With min_hops=2, max_hops=3, starting at a:
+        - Can reach b at hop 1
+        - Can reach c at hop 2
+        - Can reach a again at hop 3
+        """
+        nodes = [{"id": x} for x in ["a", "b", "c"]]
+        edges = [
+            {"edge_id": "e1", "src": "a", "dst": "b"},
+            {"edge_id": "e2", "src": "b", "dst": "c"},
+            {"edge_id": "e3", "src": "c", "dst": "a"},
+        ]
+        ops = [
+            n({"id": "a"}),
+            e_forward(min_hops=2, max_hops=3, label_node_hops="hop", label_edge_hops="ehop"),
+            n(),
+        ]
+        oracle = _run_parity_case(nodes, edges, ops, check_hop_labels=True)
+        # All nodes should be reachable
+        assert set(oracle.nodes["id"]) == {"a", "b", "c"}
+
+    def test_branching_path_lengths(self):
+        """Test behavior with branching paths of different lengths.
+
+        Graph:
+            a -> b -> c -> d (3 hops to d via long path)
+            a -> x -> d (2 hops to d via short path)
+
+        With min_hops=3, max_hops=3, d is reachable at hop 3 (via the long path).
+        Both paths are explored during traversal, since:
+        - a->b->c->d: 3 hops - meets min_hops=3 requirement
+        - a->x->d: 2 hops - but x and d are still reachable in the graph
+
+        Note: GFQL semantics include all reachable nodes/edges where at least
+        one path satisfies the hop bounds. This is a parity test against GFQL.
+        """
+        nodes = [{"id": x} for x in ["a", "b", "c", "d", "x"]]
+        edges = [
+            {"edge_id": "e1", "src": "a", "dst": "b"},
+            {"edge_id": "e2", "src": "b", "dst": "c"},
+            {"edge_id": "e3", "src": "c", "dst": "d"},
+            {"edge_id": "short1", "src": "a", "dst": "x"},
+            {"edge_id": "short2", "src": "x", "dst": "d"},
+        ]
+        ops = [
+            n({"id": "a"}),
+            e_forward(min_hops=3, max_hops=3, label_node_hops="hop"),
+            n(),
+        ]
+        # This is a parity test - just verify oracle matches GFQL
+        _run_parity_case(nodes, edges, ops, check_hop_labels=True)
+
+    def test_reverse_with_bounds(self):
+        """Reverse traversal with bounds should work correctly.
+
+        Graph: a -> b -> c -> d
+
+        Starting at d, e_reverse, min_hops=2, max_hops=2:
+        - Reverse traversal: d <- c <- b <- a
+        - hop 1: c, hop 2: b, hop 3: a
+        - Valid destination: b (at hop 2)
+        - All paths to b are included: d->c->b, so c is included as intermediate
+        - a is NOT included because it's hop 3 (beyond max_hops=2)
+        """
+        nodes = [{"id": x} for x in ["a", "b", "c", "d"]]
+        edges = [
+            {"edge_id": "e1", "src": "a", "dst": "b"},
+            {"edge_id": "e2", "src": "b", "dst": "c"},
+            {"edge_id": "e3", "src": "c", "dst": "d"},
+        ]
+        ops = [
+            n({"id": "d"}),
+            e_reverse(min_hops=2, max_hops=2, label_node_hops="hop"),
+            n(),
+        ]
+        oracle = _run_parity_case(nodes, edges, ops, check_hop_labels=True)
+        output_nodes = set(oracle.nodes["id"])
+        # b is reachable at exactly 2 reverse hops (valid destination)
+        assert "b" in output_nodes
+        # c is included as intermediate node on path to b
+        assert "c" in output_nodes
+        # a is at hop 3, beyond max_hops, not included
+        assert "a" not in output_nodes
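+
+    # Assumption: undirected parity relies on edges being mirrored in both
+    # directions before _bounded_paths runs, so hop depth is direction-agnostic.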
+
+    def test_undirected_with_output_slice(self):
+        """Undirected traversal with output slicing.
+
+        Graph: a -- b -- c -- d (undirected)
+
+        Starting at b, e_undirected, max_hops=2, output_min_hops=2:
+        - Reaches a,c at hop 1
+        - Reaches d at hop 2 (from c)
+        - Edge e3 (c->d) is at hop 2, so it's kept
+        - Output edges: e3
+        - Output nodes: endpoints of e3 (c, d)
+        - Node d has hop=2 (valid), c has hop=NA (outside slice)
+        """
+        nodes = [{"id": x} for x in ["a", "b", "c", "d"]]
+        edges = [
+            {"edge_id": "e1", "src": "a", "dst": "b"},
+            {"edge_id": "e2", "src": "b", "dst": "c"},
+            {"edge_id": "e3", "src": "c", "dst": "d"},
+        ]
+        ops = [
+            n({"id": "b"}),
+            e_undirected(max_hops=2, output_min_hops=2, label_node_hops="hop"),
+            n(),
+        ]
+        oracle = _run_parity_case(nodes, edges, ops, check_hop_labels=True)
+        output_nodes = set(oracle.nodes["id"])
+        output_edges = set(oracle.edges["edge_id"])
+        # Only edge e3 (hop 2) is in output
+        assert "e3" in output_edges
+        assert "e1" not in output_edges  # hop 1
+        assert "e2" not in output_edges  # hop 1
+        # Nodes: endpoints of kept edge e3
+        assert "c" in output_nodes  # source of e3
+        assert "d" in output_nodes  # dest of e3
+        assert "a" not in output_nodes  # not endpoint of e3
+
+    def test_empty_result_unreachable_bounds(self):
+        """When bounds can't be satisfied, the result should be empty.
+
+        Graph: a -> b (1 edge)
+
+        With min_hops=5, max_hops=10: nothing is reachable.
+        """
+        nodes = [{"id": x} for x in ["a", "b"]]
+        edges = [{"edge_id": "e1", "src": "a", "dst": "b"}]
+        ops = [
+            n({"id": "a"}),
+            e_forward(min_hops=5, max_hops=10),
+            n(),
+        ]
+        oracle = _run_parity_case(nodes, edges, ops)
+        assert oracle.nodes.empty or len(oracle.nodes) == 0
+        assert oracle.edges.empty or len(oracle.edges) == 0
diff --git a/tests/gfql/ref/test_ref_enumerator.py b/tests/gfql/ref/test_ref_enumerator.py
index 81af62ef78..3dc23d0f25 100644
--- a/tests/gfql/ref/test_ref_enumerator.py
+++ b/tests/gfql/ref/test_ref_enumerator.py
@@ -203,6 +203,38 @@ def test_paths_are_deterministically_sorted():
     assert tuples == sorted(tuples)
 
 
+def test_enumerator_min_max_three_branch_unlabeled():
+    nodes = pd.DataFrame(
+        [
+            {"id": "a"},
+            {"id": "b1"},
+            {"id": "c1"},
+            {"id": "d1"},
+            {"id": "e1"},
+            {"id": "b2"},
+            {"id": "c2"},
+        ]
+    )
+    edges = pd.DataFrame(
+        [
+            {"edge_id": "e1", "src": "a", "dst": "b1"},
+            {"edge_id": "e2", "src": "b1", "dst": "c1"},
+            {"edge_id": "e3", "src": "c1", "dst": "d1"},
+            {"edge_id": "e4", "src": "d1", "dst": "e1"},
+            {"edge_id": "e5", "src": "a", "dst": "b2"},
+            {"edge_id": "e6", "src": "b2", "dst": "c2"},
+        ]
+    )
+    g = _plottable(nodes, edges)
+    result = enumerate_chain(
+        g,
+        [n({"id": "a"}), e_forward(min_hops=3, max_hops=3), n()],
+        caps=OracleCaps(max_nodes=20, max_edges=20),
+    )
+    assert _col_set(result.nodes, "id") == {"a", "b1", "c1", "d1"}
+    assert _col_set(result.edges, "edge_id") == {"e1", "e2", "e3"}
+
+
 NODE_POOL = [f"n{i}" for i in range(6)]
 EDGE_POOL = [f"e{i}" for i in range(8)]