From b62f2542e6cc251abaf1f38453c3998d0e84e2fd Mon Sep 17 00:00:00 2001 From: praneeth_paikray-data Date: Sun, 29 Mar 2026 19:20:33 +0530 Subject: [PATCH] fix: exclude node_modules and other non-deployable dirs from workspace uploads The upload_folder() and _collect_files() functions were only filtering out hidden files and __pycache__. This meant node_modules, venv, dist, build, and other dependency/build directories got uploaded during app deployments, causing unnecessary slowdowns on the initial upload pass. Added an EXCLUDED_DIRS set that both _collect_files and _collect_directories use to prune the walk early, so os.walk never descends into these trees. Updated deployment docs to document the behavior. Co-authored-by: Isaac --- .../databricks-app-python/4-deployment.md | 10 +++- .../databricks_tools_core/file/workspace.py | 36 ++++++++++-- .../tests/unit/test_workspace.py | 57 +++++++++++++++++++ 3 files changed, 97 insertions(+), 6 deletions(-) diff --git a/databricks-skills/databricks-app-python/4-deployment.md b/databricks-skills/databricks-app-python/4-deployment.md index b318bbdf..0d0ab9f2 100644 --- a/databricks-skills/databricks-app-python/4-deployment.md +++ b/databricks-skills/databricks-app-python/4-deployment.md @@ -35,13 +35,21 @@ env: | FastAPI | `["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]` | | Reflex | `["reflex", "run", "--env", "prod"]` | +### Excluded directories + +When uploading via the SDK's `upload_folder()` / `upload_to_workspace()`, the following directories are automatically skipped at every level of the tree to keep uploads fast: + +`node_modules`, `__pycache__`, `.venv`, `venv`, `.tox`, `.pytest_cache`, `.mypy_cache`, `.ruff_cache`, `dist`, `build`, `.eggs`, `*.egg-info` (any directory whose name ends in `.egg-info`). All other hidden (dot-prefixed) files and directories are skipped as well. + +If you use `databricks workspace import-dir` directly, it does **not** apply these exclusions. Either clean the directory first or use the SDK upload functions instead. 
+ ### Step 2: Create and Deploy ```bash # Create the app databricks apps create -# Upload source code +# Upload source code (import-dir uploads everything, so remove node_modules, venv, etc. first) databricks workspace mkdirs /Workspace/Users//apps/ databricks workspace import-dir . /Workspace/Users//apps/ diff --git a/databricks-tools-core/databricks_tools_core/file/workspace.py b/databricks-tools-core/databricks_tools_core/file/workspace.py index 0b681b4c..8d77bec3 100644 --- a/databricks-tools-core/databricks_tools_core/file/workspace.py +++ b/databricks-tools-core/databricks_tools_core/file/workspace.py @@ -19,6 +19,24 @@ from ..auth import get_workspace_client +# Directories that should never be uploaded to a Databricks workspace. +# These are build artifacts, dependency caches, and virtual environments +# that bloat uploads and slow down deployments. NOTE: "*.egg-info" is not a glob here; set membership only matches exact names, so callers also check d.endswith(".egg-info"). +EXCLUDED_DIRS = frozenset({ + "node_modules", + "__pycache__", + ".venv", + "venv", + ".tox", + ".pytest_cache", + ".mypy_cache", + ".ruff_cache", + "dist", + "build", + ".eggs", + "*.egg-info", +}) + @dataclass class UploadResult: @@ -122,10 +140,15 @@ def _collect_files(local_folder: str) -> List[tuple]: files = [] local_folder = os.path.abspath(local_folder) - for dirpath, _, filenames in os.walk(local_folder): + for dirpath, dirnames, filenames in os.walk(local_folder): + # Prune excluded directories so os.walk doesn't descend into them + dirnames[:] = [ + d for d in dirnames + if not d.startswith(".") and d not in EXCLUDED_DIRS and not d.endswith(".egg-info") + ] + for filename in filenames: - # Skip hidden files and __pycache__ - if filename.startswith(".") or "__pycache__" in dirpath: + if filename.startswith("."): continue local_path = os.path.join(dirpath, filename) @@ -149,8 +172,11 @@ def _collect_directories(local_folder: str) -> List[str]: local_folder = os.path.abspath(local_folder) for dirpath, dirnames, _ in os.walk(local_folder): - # Skip hidden directories and __pycache__ - dirnames[:] = [d for d in dirnames if not 
d.startswith(".") and d != "__pycache__"] + # Skip hidden directories and common non-deployable directories + dirnames[:] = [ + d for d in dirnames + if not d.startswith(".") and d not in EXCLUDED_DIRS and not d.endswith(".egg-info") + ] for dirname in dirnames: full_path = os.path.join(dirpath, dirname) diff --git a/databricks-tools-core/tests/unit/test_workspace.py b/databricks-tools-core/tests/unit/test_workspace.py index 35adede5..bce11db6 100644 --- a/databricks-tools-core/tests/unit/test_workspace.py +++ b/databricks-tools-core/tests/unit/test_workspace.py @@ -60,6 +60,45 @@ def test_skips_pycache(self, tmp_path): assert len(files) == 1 assert files[0][1] == "file.py" + def test_skips_node_modules(self, tmp_path): + """Should skip node_modules directories.""" + (tmp_path / "app.py").write_text("content") + (tmp_path / "node_modules").mkdir() + (tmp_path / "node_modules" / "lodash").mkdir() + (tmp_path / "node_modules" / "lodash" / "index.js").write_text("module") + + files = _collect_files(str(tmp_path)) + + assert len(files) == 1 + assert files[0][1] == "app.py" + + def test_skips_venv_directories(self, tmp_path): + """Should skip venv and .venv directories.""" + (tmp_path / "app.py").write_text("content") + (tmp_path / "venv").mkdir() + (tmp_path / "venv" / "bin").mkdir() + (tmp_path / "venv" / "bin" / "python").write_text("binary") + (tmp_path / ".venv").mkdir() + (tmp_path / ".venv" / "lib").mkdir() + + files = _collect_files(str(tmp_path)) + + assert len(files) == 1 + assert files[0][1] == "app.py" + + def test_skips_build_artifacts(self, tmp_path): + """Should skip dist and build directories.""" + (tmp_path / "app.py").write_text("content") + (tmp_path / "dist").mkdir() + (tmp_path / "dist" / "bundle.js").write_text("bundled") + (tmp_path / "build").mkdir() + (tmp_path / "build" / "output.js").write_text("built") + + files = _collect_files(str(tmp_path)) + + assert len(files) == 1 + assert files[0][1] == "app.py" + class TestCollectDirectories: """Tests 
for _collect_directories helper function.""" @@ -86,6 +125,24 @@ def test_skips_hidden_directories(self, tmp_path): assert "visible" in dirs assert ".hidden" not in dirs + def test_skips_excluded_directories(self, tmp_path): + """Should skip node_modules, venv, dist, build, and other excluded dirs.""" + (tmp_path / "src").mkdir() + (tmp_path / "node_modules").mkdir() + (tmp_path / "venv").mkdir() + (tmp_path / "dist").mkdir() + (tmp_path / "build").mkdir() + (tmp_path / "__pycache__").mkdir() + + dirs = _collect_directories(str(tmp_path)) + + assert "src" in dirs + assert "node_modules" not in dirs + assert "venv" not in dirs + assert "dist" not in dirs + assert "build" not in dirs + assert "__pycache__" not in dirs + class TestUploadToWorkspace: """Tests for upload_to_workspace function."""