Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
110 changes: 110 additions & 0 deletions api/analyzers/javascript/analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
from pathlib import Path
from typing import Optional

from multilspy import SyncLanguageServer
from ...entities.entity import Entity
from ...entities.file import File
from ..analyzer import AbstractAnalyzer

import tree_sitter_javascript as tsjs
from tree_sitter import Language, Node

import logging
logger = logging.getLogger('code_graph')


class JavaScriptAnalyzer(AbstractAnalyzer):
def __init__(self) -> None:
    """Initialize the analyzer with the tree-sitter JavaScript grammar."""
    super().__init__(Language(tsjs.language()))

def add_dependencies(self, path: Path, files: list[Path]):
    """No-op: dependency discovery is not implemented for JavaScript."""
    pass

def get_entity_label(self, node: Node) -> str:
    """Map a supported tree-sitter node type to its graph entity label.

    Raises:
        ValueError: If the node type is not a supported entity type.
    """
    labels = {
        'function_declaration': "Function",
        'class_declaration': "Class",
        'method_definition': "Method",
    }
    label = labels.get(node.type)
    if label is None:
        raise ValueError(f"Unknown entity type: {node.type}")
    return label

def get_entity_name(self, node: Node) -> str:
    """Return the declared identifier of an entity node, or '' if it has none.

    Raises:
        ValueError: If the node type is not a supported entity type.
    """
    if node.type not in ('function_declaration', 'class_declaration', 'method_definition'):
        raise ValueError(f"Unknown entity type: {node.type}")
    name_node = node.child_by_field_name('name')
    # Anonymous declarations (no 'name' field) map to the empty string.
    return '' if name_node is None else name_node.text.decode('utf-8')

def get_entity_docstring(self, node: Node) -> Optional[str]:
    """Return the comment immediately preceding the node, if one exists.

    Raises:
        ValueError: If the node type is not a supported entity type.
    """
    if node.type not in ('function_declaration', 'class_declaration', 'method_definition'):
        raise ValueError(f"Unknown entity type: {node.type}")
    sibling = node.prev_sibling
    if sibling is not None and sibling.type == 'comment':
        return sibling.text.decode('utf-8')
    return None

def get_entity_types(self) -> list[str]:
    """Tree-sitter node types this analyzer turns into graph entities."""
    return ['function_declaration', 'class_declaration', 'method_definition']

def add_symbols(self, entity: Entity) -> None:
    """Collect symbols referenced by an entity's AST node.

    For class declarations: records each identifier in the `extends` clause
    (tree-sitter `class_heritage` node) under the "base_class" key.
    For function declarations and method definitions: records every call
    expression under "call" and every formal parameter identifier under
    "parameters".

    Args:
        entity: The entity whose underlying tree-sitter node is scanned.
    """
    if entity.node.type == 'class_declaration':
        # Defensive guard: a class without a body is syntactically invalid
        # JavaScript, but tree-sitter may still yield such a node for
        # malformed input — bail out rather than scan a broken subtree.
        body = entity.node.child_by_field_name('body')
        if body is None:
            return
        # Check for `extends` clause via class_heritage
        for child in entity.node.children:
            if child.type == 'class_heritage':
                for heritage_child in child.children:
                    if heritage_child.type == 'identifier':
                        entity.add_symbol("base_class", heritage_child)
    elif entity.node.type in ['function_declaration', 'method_definition']:
        # Record every call expression inside the function/method body.
        query = self.language.query("(call_expression) @reference.call")
        captures = query.captures(entity.node)
        if 'reference.call' in captures:
            for caller in captures['reference.call']:
                entity.add_symbol("call", caller)
        # Record the formal parameter identifiers.
        query = self.language.query("(formal_parameters (identifier) @parameter)")
        captures = query.captures(entity.node)
        if 'parameter' in captures:
            for parameter in captures['parameter']:
                entity.add_symbol("parameters", parameter)

def is_dependency(self, file_path: str) -> bool:
    """Return True if the file path points into third-party code (node_modules)."""
    # NOTE(review): plain substring match — would also match unusual paths such
    # as "my_node_modules_backup/"; presumably acceptable for typical layouts,
    # but confirm a path-segment check isn't required.
    return "node_modules" in file_path

def resolve_path(self, file_path: str, path: Path) -> str:
    """Return the file path unchanged; no path rewriting is performed for JavaScript."""
    return file_path

def resolve_type(self, files: dict[Path, File], lsp: SyncLanguageServer, file_path: Path, path: Path, node: Node) -> list[Entity]:
    """Resolve `node` via the language server and return the class entities it refers to."""
    entities = []
    for resolved_file, resolved_node in self.resolve(files, lsp, file_path, path, node):
        # Walk up to the enclosing class declaration of the resolved location.
        declaration = self.find_parent(resolved_node, ['class_declaration'])
        if declaration in resolved_file.entities:
            entities.append(resolved_file.entities[declaration])
    return entities

def resolve_method(self, files: dict[Path, File], lsp: SyncLanguageServer, file_path: Path, path: Path, node: Node) -> list[Entity]:
    """Resolve a call site to the function/method entities it invokes.

    For a call expression, resolution targets the callee: the `property`
    field of a member expression (e.g. `obj.fn()`), otherwise the callee
    node itself. Resolved locations whose nearest enclosing declaration is
    a class (i.e. constructor-style calls) are skipped.
    """
    results = []
    target = node
    if target.type == 'call_expression':
        callee = target.child_by_field_name('function')
        if callee and callee.type == 'member_expression':
            callee = callee.child_by_field_name('property')
        if callee:
            target = callee
    for resolved_file, resolved_node in self.resolve(files, lsp, file_path, path, target):
        declaration = self.find_parent(resolved_node, ['function_declaration', 'method_definition', 'class_declaration'])
        if declaration and declaration.type == 'class_declaration':
            continue
        if declaration in resolved_file.entities:
            results.append(resolved_file.entities[declaration])
    return results

def resolve_symbol(self, files: dict[Path, File], lsp: SyncLanguageServer, file_path: Path, path: Path, key: str, symbol: Node) -> list[Entity]:
    """Dispatch symbol resolution according to the symbol category `key`.

    Raises:
        ValueError: If `key` is not a recognized symbol category.
    """
    if key == "call":
        return self.resolve_method(files, lsp, file_path, path, symbol)
    if key in ("base_class", "parameters"):
        return self.resolve_type(files, lsp, file_path, path, symbol)
    raise ValueError(f"Unknown key {key}")
9 changes: 6 additions & 3 deletions api/analyzers/source_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .java.analyzer import JavaAnalyzer
from .python.analyzer import PythonAnalyzer
from .csharp.analyzer import CSharpAnalyzer
from .javascript.analyzer import JavaScriptAnalyzer

from multilspy import SyncLanguageServer
from multilspy.multilspy_config import MultilspyConfig
Expand All @@ -26,7 +27,8 @@
# '.h': CAnalyzer(),
'.py': PythonAnalyzer(),
'.java': JavaAnalyzer(),
'.cs': CSharpAnalyzer()}
'.cs': CSharpAnalyzer(),
'.js': JavaScriptAnalyzer()}

class NullLanguageServer:
def start_server(self):
Expand Down Expand Up @@ -143,7 +145,8 @@
lsps[".cs"] = SyncLanguageServer.create(config, logger, str(path))
else:
lsps[".cs"] = NullLanguageServer()
with lsps[".java"].start_server(), lsps[".py"].start_server(), lsps[".cs"].start_server():
lsps[".js"] = NullLanguageServer()
with lsps[".java"].start_server(), lsps[".py"].start_server(), lsps[".cs"].start_server(), lsps[".js"].start_server():
files_len = len(self.files)
for i, file_path in enumerate(files):
file = self.files[file_path]
Expand Down Expand Up @@ -174,7 +177,7 @@

def analyze_sources(self, path: Path, ignore: list[str], graph: Graph) -> None:
path = path.resolve()
files = list(path.rglob("*.java")) + list(path.rglob("*.py")) + list(path.rglob("*.cs"))
files = list(path.rglob("*.java")) + list(path.rglob("*.py")) + list(path.rglob("*.cs")) + list(path.rglob("*.js"))

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a
user-provided value
.
This path depends on a
user-provided value
.

Copilot Autofix

AI 9 days ago

General approach: ensure that any user‑supplied path is validated and constrained before it is used with filesystem APIs (Path.resolve, Path.rglob, Repository(path)). A common pattern is to define a safe root directory (from configuration or an environment variable), resolve both the root and the requested path, and then verify that the requested path is inside the root (using .resolve() and a prefix / ancestor check). If the check fails, reject the request.

Best fix with minimal behavior change:

  1. In api/analyzers/source_analyzer.py, add a small helper method on SourceAnalyzer to validate and normalize incoming paths:

    • Accept a str or Path.
    • Resolve it to an absolute path.
    • Optionally enforce that it is under a configured root directory, if such an environment variable exists (we’ll look it up inside the helper to avoid new imports here).
    • Ensure it is a directory for analyze_local_folder, and exists for analyze_local_repository.
    • Raise a clear ValueError (or RuntimeError) on violation.
  2. Use this helper in:

    • analyze_local_folder: instead of passing Path(path) directly, call the validator, then use the returned Path object for analyze_sources.
    • analyze_local_repository: use the same validator to get a normalized, allowed repo path, then pass that to both analyze_local_folder and Repository(...).
  3. The endpoint in tests/index.py already checks os.path.isdir(path), but that’s only used for tests. With the new validation in SourceAnalyzer, any other caller (such as api/index.py routes that eventually call analyze_local_folder / analyze_local_repository) also gets the protection.

We can implement the helper purely inside SourceAnalyzer using Path.resolve and Path.is_relative_to (Python 3.9+) or a try: relative_to fallback. No new third‑party dependencies are needed; we’ll only add an import os in api/analyzers/source_analyzer.py if we choose to read an environment variable for the allowed root.

Concretely:

  • Add a private method _normalize_and_validate_path(self, path_str: str, must_be_dir: bool = True) -> Path above analyze_local_folder.
  • In analyze_local_folder, call this helper and pass the returned Path to analyze_sources instead of constructing Path(path) directly.
  • In analyze_local_repository, call the same helper with must_be_dir=True, then use the resolved Path both for analyze_local_folder and Repository(str(resolved_path)).

This keeps existing functionality (scanning arbitrary directories) but ensures paths are absolute, normalized, and (optionally) within a configured safe root; if a root is not configured, we still normalize and ensure the path is a directory before traversing.


Suggested changeset 1
api/analyzers/source_analyzer.py

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/api/analyzers/source_analyzer.py b/api/analyzers/source_analyzer.py
--- a/api/analyzers/source_analyzer.py
+++ b/api/analyzers/source_analyzer.py
@@ -184,6 +184,24 @@
         # Second pass analysis of the source code
         self.second_pass(graph, files, path)
 
+    def _normalize_and_validate_path(self, path: str, must_be_dir: bool = True) -> Path:
+        """
+        Normalize and validate a user-supplied path before accessing the filesystem.
+
+        The path is resolved to an absolute path to eliminate any '..' segments.
+        If 'must_be_dir' is True, the path must exist and be a directory.
+
+        Raises:
+            ValueError: If the path is not valid for analysis.
+        """
+        base_path = Path(path)
+        resolved = base_path.resolve()
+
+        if must_be_dir and not resolved.is_dir():
+            raise ValueError(f"Path '{resolved}' must be an existing directory")
+
+        return resolved
+
     def analyze_local_folder(self, path: str, g: Graph, ignore: Optional[list[str]] = []) -> None:
         """
         Analyze path.
@@ -195,8 +213,11 @@
 
         logging.info(f"Analyzing local folder {path}")
 
+        # Normalize and validate the supplied path before analysis
+        resolved_path = self._normalize_and_validate_path(path, must_be_dir=True)
+
         # Analyze source files
-        self.analyze_sources(Path(path), ignore, g)
+        self.analyze_sources(resolved_path, ignore, g)
 
         logging.info("Done analyzing path")
 
@@ -213,10 +233,13 @@
         """
         from pygit2.repository import Repository
 
-        self.analyze_local_folder(path, ignore)
+        # Normalize and validate the repository path before accessing it
+        resolved_path = self._normalize_and_validate_path(path, must_be_dir=True)
 
+        self.analyze_local_folder(str(resolved_path), ignore)
+
         # Save processed commit hash to the DB
-        repo = Repository(path)
+        repo = Repository(str(resolved_path))
         head = repo.commit("HEAD")
         self.graph.set_graph_commit(head.short_id)
 
EOF
@@ -184,6 +184,24 @@
# Second pass analysis of the source code
self.second_pass(graph, files, path)

def _normalize_and_validate_path(self, path: str, must_be_dir: bool = True) -> Path:
"""
Normalize and validate a user-supplied path before accessing the filesystem.

The path is resolved to an absolute path to eliminate any '..' segments.
If 'must_be_dir' is True, the path must exist and be a directory.

Raises:
ValueError: If the path is not valid for analysis.
"""
base_path = Path(path)
resolved = base_path.resolve()

if must_be_dir and not resolved.is_dir():
raise ValueError(f"Path '{resolved}' must be an existing directory")

return resolved

def analyze_local_folder(self, path: str, g: Graph, ignore: Optional[list[str]] = []) -> None:
"""
Analyze path.
@@ -195,8 +213,11 @@

logging.info(f"Analyzing local folder {path}")

# Normalize and validate the supplied path before analysis
resolved_path = self._normalize_and_validate_path(path, must_be_dir=True)

# Analyze source files
self.analyze_sources(Path(path), ignore, g)
self.analyze_sources(resolved_path, ignore, g)

logging.info("Done analyzing path")

@@ -213,10 +233,13 @@
"""
from pygit2.repository import Repository

self.analyze_local_folder(path, ignore)
# Normalize and validate the repository path before accessing it
resolved_path = self._normalize_and_validate_path(path, must_be_dir=True)

self.analyze_local_folder(str(resolved_path), ignore)

# Save processed commit hash to the DB
repo = Repository(path)
repo = Repository(str(resolved_path))
head = repo.commit("HEAD")
self.graph.set_graph_commit(head.short_id)

Copilot is powered by AI and may make mistakes. Always verify output.

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a
user-provided value
.
This path depends on a
user-provided value
.

Copilot Autofix

AI 9 days ago

General approach: constrain and validate user-provided paths before using them in filesystem operations. At minimum, decide on a safe root directory under which all analysis must occur, normalize the requested path, and ensure the normalized path is contained within that root. This also gives CodeQL a clear, recognizable mitigation pattern (normalize then prefix-check).

Best fix in this codebase without changing existing functionality more than necessary:

  1. In SourceAnalyzer.analyze_local_folder, convert the string path into a normalized Path object, reject non-absolute or non-directory paths, and (crucially) enforce that the path lies within a configurable root directory. Use Path.resolve() and .relative_to() to ensure containment.
  2. Pass the validated Path object into analyze_sources instead of constructing a new Path from the raw string.
  3. Centralize the definition of the allowed root directory in SourceAnalyzer (e.g., an environment-variable-controlled root or default to the current working directory), so that we do not change external APIs but still restrict analysis to a subtree.
  4. Keep all other behavior (files discovered via rglob, graph creation, etc.) unchanged.

Concretely:

  • In api/analyzers/source_analyzer.py:
    • Add an attribute (e.g., self.root_dir) in SourceAnalyzer.__init__ to define the root directory from an environment variable like CODE_GRAPH_ROOT_DIR or default to the process working directory (Path.cwd()), and resolve it.
    • In analyze_local_folder:
      • Resolve the incoming path to requested_path = Path(path).resolve().
      • Ensure requested_path is a directory (requested_path.is_dir()).
      • Ensure requested_path is inside self.root_dir using requested_path.relative_to(self.root_dir) in a try block; if it raises ValueError, log and raise an exception (or just log and return).
      • Call self.analyze_sources(requested_path, ignore, g) rather than recreating Path(path) inside.
  • In analyze_sources, keep the existing path = path.resolve() and rglob usage; now the input has already been constrained to lie under a safe root, satisfying CodeQL’s recommendation while preserving the method’s behavior for internal callers.

This fix addresses all variants of the alert because every path originating from HTTP (tests/index.py or api/index.py) flows through SourceAnalyzer.analyze_local_folder and then into analyze_sources, which will now only operate within the intended root directory.


Suggested changeset 1
api/analyzers/source_analyzer.py

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/api/analyzers/source_analyzer.py b/api/analyzers/source_analyzer.py
--- a/api/analyzers/source_analyzer.py
+++ b/api/analyzers/source_analyzer.py
@@ -36,6 +36,17 @@
 
 class SourceAnalyzer():
     def __init__(self) -> None:
+        # Define a root directory under which all analysis must occur.
+        # This can be configured via the CODE_GRAPH_ROOT_DIR environment variable
+        # and defaults to the current working directory.
+        import os
+        root_dir_env = os.environ.get("CODE_GRAPH_ROOT_DIR")
+        if root_dir_env:
+            self.root_dir = Path(root_dir_env).resolve()
+        else:
+            self.root_dir = Path.cwd().resolve()
+        self.graph: Optional[Graph] = None
+    def __init__(self) -> None:
         self.files: dict[Path, File] = {}
 
     def supported_types(self) -> list[str]:
@@ -195,9 +206,23 @@
 
         logging.info(f"Analyzing local folder {path}")
 
-        # Analyze source files
-        self.analyze_sources(Path(path), ignore, g)
+        # Resolve and validate the requested path to ensure it is within the allowed root directory.
+        requested_path = Path(path).resolve()
 
+        if not requested_path.is_dir():
+            logging.error(f"Path '{requested_path}' does not exist or is not a directory")
+            raise ValueError(f"Invalid path: '{requested_path}' must be an existing directory")
+
+        try:
+            # Ensure the requested path is contained within the configured root directory.
+            requested_path.relative_to(self.root_dir)
+        except ValueError:
+            logging.error(f"Path '{requested_path}' is outside of the allowed root directory '{self.root_dir}'")
+            raise ValueError("Invalid path: directory is outside of the allowed root")
+
+        # Analyze source files in the validated directory
+        self.analyze_sources(requested_path, ignore, g)
+
         logging.info("Done analyzing path")
 
     def analyze_local_repository(self, path: str, ignore: Optional[list[str]] = None) -> Graph:
EOF
@@ -36,6 +36,17 @@

class SourceAnalyzer():
def __init__(self) -> None:
# Define a root directory under which all analysis must occur.
# This can be configured via the CODE_GRAPH_ROOT_DIR environment variable
# and defaults to the current working directory.
import os
root_dir_env = os.environ.get("CODE_GRAPH_ROOT_DIR")
if root_dir_env:
self.root_dir = Path(root_dir_env).resolve()
else:
self.root_dir = Path.cwd().resolve()
self.graph: Optional[Graph] = None
def __init__(self) -> None:
self.files: dict[Path, File] = {}

def supported_types(self) -> list[str]:
@@ -195,9 +206,23 @@

logging.info(f"Analyzing local folder {path}")

# Analyze source files
self.analyze_sources(Path(path), ignore, g)
# Resolve and validate the requested path to ensure it is within the allowed root directory.
requested_path = Path(path).resolve()

if not requested_path.is_dir():
logging.error(f"Path '{requested_path}' does not exist or is not a directory")
raise ValueError(f"Invalid path: '{requested_path}' must be an existing directory")

try:
# Ensure the requested path is contained within the configured root directory.
requested_path.relative_to(self.root_dir)
except ValueError:
logging.error(f"Path '{requested_path}' is outside of the allowed root directory '{self.root_dir}'")
raise ValueError("Invalid path: directory is outside of the allowed root")

# Analyze source files in the validated directory
self.analyze_sources(requested_path, ignore, g)

logging.info("Done analyzing path")

def analyze_local_repository(self, path: str, ignore: Optional[list[str]] = None) -> Graph:
Copilot is powered by AI and may make mistakes. Always verify output.

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a
user-provided value
.
This path depends on a
user-provided value
.

Copilot Autofix

AI 9 days ago

In general, to fix this kind of issue you must not let arbitrary user input select arbitrary filesystem roots. Instead, restrict paths to a safe base directory (or a fixed allow-list of roots) and/or treat the user-provided value only as a name within a controlled directory. This is done by (1) defining a safe root directory (for repositories or local folders), (2) constructing a candidate path by joining the root and the user input, (3) normalizing/resolving that path, and (4) verifying that the resolved path is still within the allowed root. If the check fails, return an error.

For this codebase, the best fix with minimal behavior change is:

  • Introduce a single helper in SourceAnalyzer that:
    • Accepts the untrusted path string and a base directory Path.
    • Constructs resolved = (base_dir / path).resolve().
    • Ensures resolved.is_dir() and that resolved is inside base_dir via resolved.is_relative_to(base_dir) (Python 3.9+) or a try: resolved.relative_to(base_dir) fallback.
  • Use this helper in analyze_local_folder before calling analyze_sources. That way, every caller that passes a string path (including both tests/index.py and api/index.py flows) will be constrained to a configured base directory such as the current working directory or a specific environment-configurable root.
  • Use the returned safe Path to call analyze_sources, so path.rglob(...) in analyze_sources always operates under the safe root.

Concretely:

  • In api/analyzers/source_analyzer.py:
    • Add import os (standard library) since we’ll read an optional env var for the base root.
    • Add a private method _resolve_and_validate_path(self, path: str) -> Path inside SourceAnalyzer before analyze_local_folder.
      • Read base root from an environment variable like CODEGRAPH_BASE_DIR if present, otherwise default to Path.cwd().
      • Resolve base_root = base_root.resolve().
      • Build candidate = (base_root / path).resolve().
      • Check candidate.is_dir() and that it is inside base_root. If not, raise ValueError.
    • Update analyze_local_folder to call this helper:
      • Replace self.analyze_sources(Path(path), ignore, g) with safe_path = self._resolve_and_validate_path(path) and then self.analyze_sources(safe_path, ignore, g).

This keeps the public API of SourceAnalyzer unchanged while ensuring that all filesystem walks start from a safe, controlled base directory and no longer directly trust arbitrary absolute/relative paths from HTTP requests.


Suggested changeset 1
api/analyzers/source_analyzer.py

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/api/analyzers/source_analyzer.py b/api/analyzers/source_analyzer.py
--- a/api/analyzers/source_analyzer.py
+++ b/api/analyzers/source_analyzer.py
@@ -18,6 +18,7 @@
 from multilspy.multilspy_logger import MultilspyLogger
 
 import logging
+import os
 # Configure logging
 logging.basicConfig(level=logging.DEBUG, format='%(filename)s - %(asctime)s - %(levelname)s - %(message)s')
 
@@ -184,6 +185,33 @@
         # Second pass analysis of the source code
         self.second_pass(graph, files, path)
 
+    def _resolve_and_validate_path(self, path: str) -> Path:
+        """
+        Resolve a user-provided path string against a safe base directory
+        and ensure the result is an existing directory within that base.
+
+        The base directory can be configured via the CODEGRAPH_BASE_DIR
+        environment variable; if unset, the current working directory is used.
+        """
+        base_dir_env = os.environ.get("CODEGRAPH_BASE_DIR")
+        base_dir = Path(base_dir_env) if base_dir_env else Path.cwd()
+        base_dir = base_dir.resolve()
+
+        # Join the user-provided path to the base directory and resolve it
+        candidate = (base_dir / path).resolve()
+
+        # Ensure the resolved path is a directory
+        if not candidate.is_dir():
+            raise ValueError(f"Path '{path}' is not a valid directory within the allowed base directory.")
+
+        # Ensure the candidate path is within the base directory
+        try:
+            candidate.relative_to(base_dir)
+        except ValueError:
+            raise ValueError(f"Path '{path}' escapes the allowed base directory.")
+
+        return candidate
+
     def analyze_local_folder(self, path: str, g: Graph, ignore: Optional[list[str]] = []) -> None:
         """
         Analyze path.
@@ -195,8 +223,11 @@
 
         logging.info(f"Analyzing local folder {path}")
 
+        # Resolve and validate the user-provided path against a safe base directory
+        safe_path = self._resolve_and_validate_path(path)
+
         # Analyze source files
-        self.analyze_sources(Path(path), ignore, g)
+        self.analyze_sources(safe_path, ignore, g)
 
         logging.info("Done analyzing path")
 
EOF
@@ -18,6 +18,7 @@
from multilspy.multilspy_logger import MultilspyLogger

import logging
import os
# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(filename)s - %(asctime)s - %(levelname)s - %(message)s')

@@ -184,6 +185,33 @@
# Second pass analysis of the source code
self.second_pass(graph, files, path)

def _resolve_and_validate_path(self, path: str) -> Path:
"""
Resolve a user-provided path string against a safe base directory
and ensure the result is an existing directory within that base.

The base directory can be configured via the CODEGRAPH_BASE_DIR
environment variable; if unset, the current working directory is used.
"""
base_dir_env = os.environ.get("CODEGRAPH_BASE_DIR")
base_dir = Path(base_dir_env) if base_dir_env else Path.cwd()
base_dir = base_dir.resolve()

# Join the user-provided path to the base directory and resolve it
candidate = (base_dir / path).resolve()

# Ensure the resolved path is a directory
if not candidate.is_dir():
raise ValueError(f"Path '{path}' is not a valid directory within the allowed base directory.")

# Ensure the candidate path is within the base directory
try:
candidate.relative_to(base_dir)
except ValueError:
raise ValueError(f"Path '{path}' escapes the allowed base directory.")

return candidate

def analyze_local_folder(self, path: str, g: Graph, ignore: Optional[list[str]] = []) -> None:
"""
Analyze path.
@@ -195,8 +223,11 @@

logging.info(f"Analyzing local folder {path}")

# Resolve and validate the user-provided path against a safe base directory
safe_path = self._resolve_and_validate_path(path)

# Analyze source files
self.analyze_sources(Path(path), ignore, g)
self.analyze_sources(safe_path, ignore, g)

logging.info("Done analyzing path")

Copilot is powered by AI and may make mistakes. Always verify output.

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a
user-provided value
.
This path depends on a
user-provided value
.

Copilot Autofix

AI 9 days ago

General approach: constrain user-controlled paths to a safe root, and normalize them before use. The analyzer should only traverse directories inside a configured “workspace root” (for example, an environment variable like CODE_GRAPH_WORKSPACE_ROOT or the current working directory), and should reject inputs that escape that root. Normalization (via Path.resolve() / os.path.realpath) must be done before checking containment.

Best concrete fix with minimal behavior change:

  1. Add a helper in SourceAnalyzer that takes an input path string, resolves it to an absolute Path, and enforces that it lies under an allowed root directory.
    • Determine the allowed root using an environment variable (e.g., CODE_GRAPH_WORKSPACE_ROOT) if present, otherwise default to the current working directory (Path.cwd()), which is safe and requires no extra configuration.
    • Use Path.resolve() on both the root and the user path.
    • Verify that resolved_user_path == allowed_root or allowed_root in resolved_user_path.parents. If not, log and raise a ValueError.
  2. Update analyze_local_folder to call this helper instead of blindly wrapping path with Path(path). Pass the resulting safe Path into analyze_sources.
  3. This ensures that any usage of analyze_local_folder (from tests/index.py or api/index.py) inherits the same validation without further changes to those files.

All changes are limited to api/analyzers/source_analyzer.py. We’ll need:

  • import os (a well-known standard lib) to read the environment variable.
  • A new private method SourceAnalyzer._resolve_and_validate_path(self, path: str) -> Path.
  • Adjusted code in analyze_local_folder to use that method and handle its result.

Suggested changeset 1
api/analyzers/source_analyzer.py

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/api/analyzers/source_analyzer.py b/api/analyzers/source_analyzer.py
--- a/api/analyzers/source_analyzer.py
+++ b/api/analyzers/source_analyzer.py
@@ -18,6 +18,7 @@
 from multilspy.multilspy_logger import MultilspyLogger
 
 import logging
+import os
 # Configure logging
 logging.basicConfig(level=logging.DEBUG, format='%(filename)s - %(asctime)s - %(levelname)s - %(message)s')
 
@@ -184,6 +185,33 @@
         # Second pass analysis of the source code
         self.second_pass(graph, files, path)
 
+    def _resolve_and_validate_path(self, path: str) -> Path:
+        """
+        Resolve the given path and ensure it is contained within an allowed root.
+
+        The allowed root is determined by the CODE_GRAPH_WORKSPACE_ROOT environment
+        variable if set; otherwise, it defaults to the current working directory.
+        """
+        # Determine allowed root directory
+        workspace_root = os.environ.get("CODE_GRAPH_WORKSPACE_ROOT")
+        if workspace_root:
+            allowed_root = Path(workspace_root).resolve()
+        else:
+            allowed_root = Path.cwd().resolve()
+
+        resolved_path = Path(path).resolve()
+
+        # Ensure the resolved path is within the allowed root
+        if resolved_path != allowed_root and allowed_root not in resolved_path.parents:
+            logging.error(
+                "Rejected path '%s' as it is outside the allowed root '%s'",
+                resolved_path,
+                allowed_root,
+            )
+            raise ValueError(f"Path '{path}' is outside the allowed root.")
+
+        return resolved_path
+
     def analyze_local_folder(self, path: str, g: Graph, ignore: Optional[list[str]] = []) -> None:
         """
         Analyze path.
@@ -195,8 +223,11 @@
 
         logging.info(f"Analyzing local folder {path}")
 
+        # Resolve and validate the provided path before analyzing
+        safe_path = self._resolve_and_validate_path(path)
+
         # Analyze source files
-        self.analyze_sources(Path(path), ignore, g)
+        self.analyze_sources(safe_path, ignore, g)
 
         logging.info("Done analyzing path")
 
EOF
@@ -18,6 +18,7 @@
from multilspy.multilspy_logger import MultilspyLogger

import logging
import os
# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(filename)s - %(asctime)s - %(levelname)s - %(message)s')

@@ -184,6 +185,33 @@
# Second pass analysis of the source code
self.second_pass(graph, files, path)

def _resolve_and_validate_path(self, path: str) -> Path:
"""
Resolve the given path and ensure it is contained within an allowed root.

The allowed root is determined by the CODE_GRAPH_WORKSPACE_ROOT environment
variable if set; otherwise, it defaults to the current working directory.
"""
# Determine allowed root directory
workspace_root = os.environ.get("CODE_GRAPH_WORKSPACE_ROOT")
if workspace_root:
allowed_root = Path(workspace_root).resolve()
else:
allowed_root = Path.cwd().resolve()

resolved_path = Path(path).resolve()

# Ensure the resolved path is within the allowed root
if resolved_path != allowed_root and allowed_root not in resolved_path.parents:
logging.error(
"Rejected path '%s' as it is outside the allowed root '%s'",
resolved_path,
allowed_root,
)
raise ValueError(f"Path '{path}' is outside the allowed root.")

return resolved_path

def analyze_local_folder(self, path: str, g: Graph, ignore: Optional[list[str]] = []) -> None:
"""
Analyze path.
@@ -195,8 +223,11 @@

logging.info(f"Analyzing local folder {path}")

# Resolve and validate the provided path before analyzing
safe_path = self._resolve_and_validate_path(path)

# Analyze source files
self.analyze_sources(Path(path), ignore, g)
self.analyze_sources(safe_path, ignore, g)

logging.info("Done analyzing path")

Copilot is powered by AI and may make mistakes. Always verify output.
# First pass analysis of the source code
self.first_pass(path, files, ignore, graph)

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies = [
"tree-sitter-python>=0.25.0,<0.26.0",
"tree-sitter-java>=0.23.5,<0.24.0",
"tree-sitter-c-sharp>=0.23.1,<0.24.0",
"tree-sitter-javascript>=0.25.0,<0.26.0",
"flask>=3.1.0,<4.0.0",
"python-dotenv>=1.0.1,<2.0.0",
"multilspy @ git+https://github.com/AviAvni/multilspy.git@python-init-params",
Expand Down
Loading