From cac6fdea56fd5b4add9ca7fc8bd2fe5e6261cda4 Mon Sep 17 00:00:00 2001 From: blalterman <12834389+blalterman@users.noreply.github.com> Date: Tue, 17 Feb 2026 14:50:19 -0500 Subject: [PATCH 1/2] Remove non-functional joblib parallelization from TrendFit (#428) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs(plans): add private-dev + public-release repo plan Documents the two-repo architecture for private development with public releases via rsync-based export script. Co-Authored-By: Claude Opus 4.6 * refactor(fitfunctions): remove joblib parallelization from TrendFit Joblib's loky backend deadlocks on macOS with Python 3.12's spawn start method. The multiprocessing backend fails due to unpicklable closures, and the threading backend provides no CPU-bound speedup. Remove the non-functional parallel code path, related tests, benchmarks, and the performance optional dependency. All Phase 4 features in core.py (residuals use_all, in-place mask ops, xoutside/youtside) are preserved — they have zero joblib dependency. Generated with Claude Code Co-Authored-By: Claude --------- Co-authored-by: Claude Opus 4.6 --- benchmarks/fitfunctions_performance.py | 179 --------- plans/private-dev-public-release-repos.md | 347 ++++++++++++++++++ pyproject.toml | 3 - solarwindpy/fitfunctions/trend_fits.py | 146 +------- .../fitfunctions/test_trend_fits_advanced.py | 222 +---------- 5 files changed, 357 insertions(+), 540 deletions(-) delete mode 100644 benchmarks/fitfunctions_performance.py create mode 100644 plans/private-dev-public-release-repos.md diff --git a/benchmarks/fitfunctions_performance.py b/benchmarks/fitfunctions_performance.py deleted file mode 100644 index 863c01e2..00000000 --- a/benchmarks/fitfunctions_performance.py +++ /dev/null @@ -1,179 +0,0 @@ -#!/usr/bin/env python -"""Benchmark Phase 4 performance optimizations.""" - -import time -import numpy as np -import pandas as pd -import sys -import os - -# Add the parent directory to sys.path to import solarwindpy -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) - -from solarwindpy.fitfunctions import Gaussian -from solarwindpy.fitfunctions.trend_fits import TrendFit - - -def benchmark_trendfit(n_fits=50): - """Compare sequential vs parallel TrendFit performance.""" - print(f"\nBenchmarking with {n_fits} fits...") - - # Create synthetic data that's realistic for fitting - np.random.seed(42) - x = np.linspace(0, 10, 100) - data = pd.DataFrame({ - f'col_{i}': 5 * np.exp(-(x-5)**2/2) + np.random.normal(0, 0.1, 100) - for i in range(n_fits) - }, index=x) - - # Sequential execution - print(" Running sequential...") - tf_seq = TrendFit(data, Gaussian, ffunc1d=Gaussian) - tf_seq.make_ffunc1ds() - - start = time.perf_counter() - tf_seq.make_1dfits(n_jobs=1) - seq_time = time.perf_counter() - start - - # Parallel execution - print(" Running parallel...") - tf_par = TrendFit(data, Gaussian, ffunc1d=Gaussian) - tf_par.make_ffunc1ds() - - start = time.perf_counter() - tf_par.make_1dfits(n_jobs=-1) - par_time = time.perf_counter() - start - - speedup = seq_time / par_time - print(f" Sequential: {seq_time:.2f}s") - print(f" Parallel: {par_time:.2f}s") - print(f" Speedup: {speedup:.1f}x") - - # Verify results match - print(" Verifying results match...") - successful_fits = 0 - for key in tf_seq.ffuncs.index: - if key in tf_par.ffuncs.index: # Both succeeded - seq_popt = tf_seq.ffuncs[key].popt - par_popt = tf_par.ffuncs[key].popt - for param in seq_popt: - np.testing.assert_allclose( - seq_popt[param], par_popt[param], - rtol=1e-10, atol=1e-10 - ) - successful_fits += 1 - - print(f" ✓ {successful_fits} fits verified identical") - - return speedup, successful_fits - - -def benchmark_single_fitfunction(): - """Benchmark single FitFunction to understand baseline performance.""" - print("\nBenchmarking single FitFunction...") - - np.random.seed(42) - x = np.linspace(0, 10, 100) - y = 5 * np.exp(-(x-5)**2/2) + np.random.normal(0, 0.1, 100) - - # Time creation and fitting - start = time.perf_counter() - ff = Gaussian(x, y) - creation_time = time.perf_counter() - start - - start = time.perf_counter() - ff.make_fit() - fit_time = time.perf_counter() - start - - total_time = creation_time + fit_time - - print(f" Creation time: {creation_time*1000:.1f}ms") - print(f" Fitting time: {fit_time*1000:.1f}ms") - print(f" Total time: {total_time*1000:.1f}ms") - - return total_time - - -def check_joblib_availability(): - """Check if joblib is available for parallel processing.""" - try: - import joblib - print(f"✓ joblib {joblib.__version__} available") - - # Check number of cores - import os - n_cores = os.cpu_count() - print(f"✓ {n_cores} CPU cores detected") - return True - except ImportError: - print("✗ joblib not available - only sequential benchmarks will run") - return False - - -if __name__ == "__main__": - print("FitFunctions Phase 4 Performance Benchmark") - print("=" * 50) - - # Check system capabilities - has_joblib = check_joblib_availability() - - # Single fit baseline - single_time = benchmark_single_fitfunction() - - # TrendFit scaling benchmarks - speedups = [] - fit_counts = [] - - test_sizes = [10, 25, 50, 100] - if has_joblib: - # Only run larger tests if joblib is available - test_sizes.extend([200]) - - for n in test_sizes: - expected_seq_time = single_time * n - print(f"\nExpected sequential time for {n} fits: {expected_seq_time:.1f}s") - - try: - speedup, n_successful = benchmark_trendfit(n) - speedups.append(speedup) - fit_counts.append(n_successful) - except Exception as e: - print(f" ✗ Benchmark failed: {e}") - speedups.append(1.0) - fit_counts.append(0) - - # Summary report - print("\n" + "=" * 50) - print("BENCHMARK SUMMARY") - print("=" * 50) - - print(f"Single fit baseline: {single_time*1000:.1f}ms") - - if speedups: - print("\nTrendFit Scaling Results:") - print("Fits | Successful | Speedup") - print("-" * 30) - for i, n in enumerate(test_sizes): - if i < len(speedups): - print(f"{n:4d} | {fit_counts[i]:10d} | {speedups[i]:7.1f}x") - - if has_joblib: - avg_speedup = np.mean(speedups) - best_speedup = max(speedups) - print(f"\nAverage speedup: {avg_speedup:.1f}x") - print(f"Best speedup: {best_speedup:.1f}x") - - # Efficiency analysis - if avg_speedup > 1.5: - print("✓ Parallelization provides significant benefit") - else: - print("⚠ Parallelization benefit limited (overhead or few cores)") - else: - print("\nInstall joblib for parallel processing:") - print(" pip install joblib") - print(" or") - print(" pip install solarwindpy[performance]") - - print("\nTo use parallel fitting in your code:") - print(" tf.make_1dfits(n_jobs=-1) # Use all cores") - print(" tf.make_1dfits(n_jobs=4) # Use 4 cores") \ No newline at end of file diff --git a/plans/private-dev-public-release-repos.md b/plans/private-dev-public-release-repos.md new file mode 100644 index 00000000..c1030769 --- /dev/null +++ b/plans/private-dev-public-release-repos.md @@ -0,0 +1,347 @@ +# Plan: Private Development Repo + Public Release Repo + +## Summary + +Create a private development repo (`blalterman/SolarWindPy-dev`) as the primary working +environment, while keeping the existing public repo (`blalterman/SolarWindPy`) as-is for +releases. All current public content stays public. The private repo is a **superset** of +the public repo, adding research code, private data, and experimental features. + +## Architecture + +``` +GitHub: + blalterman/SolarWindPy (public, existing) <- Release repo + blalterman/SolarWindPy-dev (private, new) <- Development repo + +Local: + ~/observatories/code/ + |-- SolarWindPy/ -> public repo clone (releases only) + +-- SolarWindPy-dev/ -> private repo clone (daily work) +``` + +**Relationship**: Private repo is a superset. It contains everything the public repo has, +plus additional private directories. Development happens in private; curated releases flow +to public via an export script. + +## Evidence & Justification + +### Why two repos instead of branch-based separation? +- Git has no per-remote file filtering. Pushing `master` to two remotes sends ALL files. +- Branch-based approaches (e.g., `develop` with private content, `master` without) require + careful merge hygiene that is error-prone for a solo developer. +- Two repos with an export script is the standard pattern used by Linux, Android, and + corporate open-source projects for private-dev -> public-release workflows. +- Zero risk of accidental private content leakage. + +### Why copy history rather than start fresh? +- The private repo should have full commit history for `git blame`, bisect, and reference. +- Starting from a bare clone means both repos share the same initial history (1,383 commits). +- setuptools_scm works identically in both (same tags, same version derivation). + +### Why keep the public repo unchanged? +- User's explicit requirement: "Everything that currently is public can stay public." +- No external link breakage (PyPI, conda-forge, ReadTheDocs, 4 stars, 3 forks). +- No CI/CD reconfiguration needed for the public repo. +- `.claude/`, `plans/`, `paper/` remain public as they already are. + +## Phase 1: Create Private Repository (~15 min) + +### Step 1.1: Create private repo on GitHub +```bash +gh repo create blalterman/SolarWindPy-dev \ + --private \ + --description "Private development repository for SolarWindPy" +``` + +### Step 1.2: Push full history to private repo +```bash +# Clone the public repo as a bare mirror +git clone --bare git@github.com:blalterman/SolarWindPy.git /tmp/swp-mirror + +# Push everything to the new private repo +cd /tmp/swp-mirror +git push --mirror git@github.com:blalterman/SolarWindPy-dev.git + +# Cleanup +rm -rf /tmp/swp-mirror +``` + +### Step 1.3: Clone private repo locally +```bash +cd ~/observatories/code +git clone git@github.com:blalterman/SolarWindPy-dev.git SolarWindPy-dev +``` + +### Step 1.4: Add public repo as a remote in the private clone +```bash +cd ~/observatories/code/SolarWindPy-dev +git remote add public git@github.com:blalterman/SolarWindPy.git +git fetch public +``` + +**Verification**: `git remote -v` shows both `origin` (private) and `public` (public). + +## Phase 2: Configure Private Repo Structure (~30 min) + +### Step 2.1: Add private-only directories +```bash +cd ~/observatories/code/SolarWindPy-dev +mkdir -p research/{notebooks,analysis,experiments} +mkdir -p private/{data,configs} +``` + +### Step 2.2: Create private content manifest +Create `.private-manifest` (tracked in private repo, excluded from export): +``` +# Paths that exist only in the private repo. +# The export script excludes these when syncing to public. +research/ +private/ +.private-manifest +``` + +### Step 2.3: Update private repo .gitignore +Add to `.gitignore`: +``` +# Private data files (never commit even to private repo) +private/data/**/*.h5 +private/data/**/*.hdf5 +private/configs/secrets.* +``` + +### Step 2.4: Initial commit of private structure +```bash +git add research/ private/ .private-manifest +git commit -m "feat: add private research and data directories + +Establishes private-only directories for research code, analysis +notebooks, experimental features, and private data/configs. +These directories are excluded from public repo syncs. + +Co-Authored-By: Claude " +git push origin master +``` + +## Phase 3: Create Export Script (~1 hour) + +### Step 3.1: Create `scripts/export-to-public.sh` in private repo + +This script: +1. Reads `.private-manifest` for paths to exclude +2. Uses `rsync` to sync everything else to the local public clone +3. Commits and optionally tags in the public clone +4. Pushes to the public GitHub repo + +```bash +#!/bin/bash +# export-to-public.sh -- Sync public-safe content from private to public repo +# +# Usage: +# ./scripts/export-to-public.sh # Sync without tagging +# ./scripts/export-to-public.sh v0.4.0 # Sync and tag for release +# ./scripts/export-to-public.sh --dry-run # Show what would be synced + +set -euo pipefail + +PRIVATE_REPO="$(cd "$(dirname "$0")/.." && pwd)" +PUBLIC_REPO="${PRIVATE_REPO}/../SolarWindPy" +MANIFEST="${PRIVATE_REPO}/.private-manifest" + +# Parse args +DRY_RUN=false +VERSION_TAG="" +for arg in "$@"; do + case "$arg" in + --dry-run) DRY_RUN=true ;; + v*) VERSION_TAG="$arg" ;; + *) echo "Unknown arg: $arg"; exit 1 ;; + esac +done + +# Validate +[ -d "$PUBLIC_REPO/.git" ] || { echo "Error: Public repo not found at $PUBLIC_REPO"; exit 1; } +[ -f "$MANIFEST" ] || { echo "Error: .private-manifest not found"; exit 1; } + +# Build rsync exclude list from manifest +EXCLUDES=() +while IFS= read -r line; do + [[ "$line" =~ ^#.*$ || -z "$line" ]] && continue + EXCLUDES+=(--exclude="$line") +done < "$MANIFEST" + +# Always exclude git directory and build artifacts +EXCLUDES+=(--exclude=".git" --exclude="dist/" --exclude="*.egg-info/" + --exclude="htmlcov/" --exclude="__pycache__/" --exclude=".pytest_cache/" + --exclude=".eggs/" --exclude="build/" --exclude="tmp/" + --exclude="staged-recipes*/" --exclude="*.pyc") + +echo "=== Export to Public Repo ===" +echo "Private: $PRIVATE_REPO" +echo "Public: $PUBLIC_REPO" +echo "Excludes: ${EXCLUDES[*]}" + +if [ "$DRY_RUN" = true ]; then + echo "--- DRY RUN ---" + rsync -avn --delete "${EXCLUDES[@]}" "$PRIVATE_REPO/" "$PUBLIC_REPO/" + exit 0 +fi + +# Sync +rsync -av --delete "${EXCLUDES[@]}" "$PRIVATE_REPO/" "$PUBLIC_REPO/" + +# Commit changes in public repo +cd "$PUBLIC_REPO" +if [ -n "$(git status --porcelain)" ]; then + git add -A + git commit -m "$(cat < +EOF +)" + echo "Committed changes to public repo" +else + echo "No changes to commit" +fi + +# Tag if requested +if [ -n "$VERSION_TAG" ]; then + git tag -a "$VERSION_TAG" -m "SolarWindPy $VERSION_TAG" + echo "Tagged as $VERSION_TAG" + echo "" + echo "Ready to push. Run:" + echo " cd $PUBLIC_REPO && git push origin master --tags" +fi + +echo "=== Export complete ===" +``` + +### Step 3.2: Make executable and commit +```bash +chmod +x scripts/export-to-public.sh +git add scripts/export-to-public.sh +git commit -m "feat(scripts): add export-to-public sync script + +Provides one-command sync from private dev repo to public release +repo, reading .private-manifest for paths to exclude. + +Co-Authored-By: Claude " +``` + +## Phase 4: CI/CD Configuration (~30 min) + +### Private repo CI/CD +No file changes needed. Existing CI/CD workflows work as-is because the private +repo is a superset of the public repo (same structure, same files, same tests). + +### Public repo CI/CD stays as-is +The `publish.yml` workflow already triggers on `v*` tags and publishes to PyPI. +The `docs.yml` workflow already builds documentation. No changes needed. + +### Optional: Disable publishing from private repo +To prevent accidental publishing from the wrong repo, remove the `PYPI_API_TOKEN` +secret from the private repo's GitHub settings. The private repo's `publish.yml` +would fail on tag push (harmless), ensuring you never accidentally publish from +the wrong source. + +**Action**: GitHub Settings (no file changes) -> Private repo -> Settings -> Secrets -> Remove `PYPI_API_TOKEN` + +## Phase 5: Development Workflow (Reference) + +### Daily development +```bash +cd ~/observatories/code/SolarWindPy-dev # Work in private repo +git checkout -b feature/my-feature # Feature branch +# ... develop, test, commit ... +git push origin feature/my-feature # Push to private +# Create PR on private repo (for your own tracking) +``` + +### Research & experiments +```bash +cd ~/observatories/code/SolarWindPy-dev +# Research code goes in research/ -- never synced to public +vim research/notebooks/ion_temperature_analysis.ipynb +vim research/experiments/new_fitting_approach.py +git add research/ +git commit -m "research: preliminary ion temperature analysis" +git push origin master +``` + +### Release workflow +```bash +cd ~/observatories/code/SolarWindPy-dev +# 1. Ensure master is clean and tests pass +pytest -q +# 2. Update CHANGELOG.md +# 3. Dry-run the export +./scripts/export-to-public.sh --dry-run +# 4. Export and tag +./scripts/export-to-public.sh v0.4.0 +# 5. Push to public +cd ~/observatories/code/SolarWindPy +git push origin master --tags +# 6. publish.yml fires -> PyPI -> conda-forge +``` + +### Pulling community contributions (if any) +```bash +cd ~/observatories/code/SolarWindPy-dev +git fetch public +git merge public/master # Pull any external PRs into private +``` + +## Phase 6: Keeping Repos in Sync (Reference) + +### When to sync +- **Always sync before release**: Run export script before tagging +- **Don't sync on every commit**: The public repo receives batch updates at release time +- **Pull from public after external PRs**: If someone contributes to the public repo + +### Conflict prevention +- All development happens in private repo -> public is never ahead of private + (except for external contributions, which are rare for a solo project) +- The export script uses `rsync --delete`, so public repo always matches the + private repo's public-safe content exactly + +### Private content safety +- `.private-manifest` lists all private-only paths +- `rsync --delete` with exclusions ensures private paths never appear in public +- No complex git gymnastics needed -- it's just a file sync + +## Risk Analysis + +| Risk | Severity | Probability | Mitigation | +|------|----------|-------------|------------| +| Forget to sync before release | Medium | Medium | Add to CHANGELOG/release checklist | +| Private content leaks to public | High | Very Low | `.private-manifest` + `rsync` excludes | +| setuptools_scm version mismatch | Medium | Low | Both repos have same tags; verify with `python -m setuptools_scm` | +| Divergent histories after external PR | Low | Low | `git fetch public && git merge public/master` | +| Accidental PyPI publish from private | Medium | Low | Remove PYPI_API_TOKEN from private repo secrets | + +## Cost Analysis (GitHub Organization Question) + +If an org is desired later, these are the costs: +- **Free tier**: Unlimited public + private repos, 2,000 CI min/month. No branch protection on private repos. +- **Team**: $4/user/month. Adds branch protection, required reviewers. +- **Recommendation**: Not needed now. Revisit if the project grows to multiple contributors. + +## Verification Checklist + +After implementation, verify: +- [ ] `git remote -v` in private clone shows both `origin` and `public` +- [ ] `pytest -q` passes in private repo +- [ ] `./scripts/export-to-public.sh --dry-run` shows correct file list +- [ ] Private-only directories (`research/`, `private/`) do NOT appear in dry-run output +- [ ] `python -m setuptools_scm` reports correct version in both repos +- [ ] `.claude/` infrastructure works normally in private repo +- [ ] Public repo CI passes after first sync + +## Critical Files + +- `SolarWindPy/.gitignore` -- needs private-data patterns (in private clone only) +- `SolarWindPy/pyproject.toml` -- URLs stay as-is (public repo) +- `SolarWindPy/.pre-commit-config.yaml` -- works in both repos unchanged +- New: `scripts/export-to-public.sh` -- the sync script (private repo only) +- New: `.private-manifest` -- private content manifest (private repo only) diff --git a/pyproject.toml b/pyproject.toml index 2a4b2e0a..57f3e5fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -103,9 +103,6 @@ dev = [ # Code analysis tools (ast-grep via MCP server, not Python package) "pre-commit>=3.5", # Git hook framework ] -performance = [ - "joblib>=1.3.0", # Parallel execution for TrendFit -] analysis = [ # Interactive analysis environment "jupyterlab>=4.0", diff --git a/solarwindpy/fitfunctions/trend_fits.py b/solarwindpy/fitfunctions/trend_fits.py index 1e389be7..a98326cc 100644 --- a/solarwindpy/fitfunctions/trend_fits.py +++ b/solarwindpy/fitfunctions/trend_fits.py @@ -6,22 +6,12 @@ """ -# import warnings import logging # noqa: F401 -import warnings import numpy as np import pandas as pd import matplotlib as mpl from collections import namedtuple -# Parallel processing support -try: - from joblib import Parallel, delayed - - JOBLIB_AVAILABLE = True -except ImportError: - JOBLIB_AVAILABLE = False - from ..plotting import subplots from . import core from . import gaussians @@ -159,146 +149,24 @@ def make_ffunc1ds(self, **kwargs): ffuncs = pd.Series(ffuncs) self._ffuncs = ffuncs - def make_1dfits(self, n_jobs=1, verbose=0, backend="loky", **kwargs): - r""" - Execute fits for all 1D functions, optionally in parallel. + def make_1dfits(self, **kwargs): + r"""Execute fits for all 1D functions. - Each FitFunction instance represents a single dataset to fit. - TrendFit creates many such instances (one per column), making - this ideal for parallelization. + Removes bad fits from `ffuncs` and saves them in `bad_fits`. Parameters ---------- - n_jobs : int, default=1 - Number of parallel jobs: - - 1: Sequential execution (default, backward compatible) - - -1: Use all available CPU cores - - n>1: Use n cores - Requires joblib: pip install joblib - verbose : int, default=0 - Joblib verbosity level (0=silent, 10=progress) - backend : str, default='loky' - Joblib backend ('loky', 'threading', 'multiprocessing') **kwargs Passed to each FitFunction.make_fit() - - Examples - -------- - >>> # TrendFit creates one FitFunction per column - >>> tf = TrendFit(agg_data, Gaussian, ffunc1d=Gaussian) - >>> tf.make_ffunc1ds() # Creates instances - >>> - >>> # Fit all instances sequentially (default) - >>> tf.make_1dfits() - >>> - >>> # Fit in parallel using all cores - >>> tf.make_1dfits(n_jobs=-1) - >>> - >>> # With progress display - >>> tf.make_1dfits(n_jobs=-1, verbose=10) - - Notes - ----- - Parallel execution returns complete fitted FitFunction objects from worker - processes, which incurs serialization overhead. This overhead typically - outweighs parallelization benefits for simple fits. Parallelization is - most beneficial for: - - - Complex fitting functions with expensive computations - - Large datasets (>1000 points per fit) - - Batch processing of many fits (>50) - - Systems with many CPU cores and sufficient memory - - For typical Gaussian fits on moderate data, sequential execution (n_jobs=1) - may be faster due to Python's GIL and serialization overhead. - - Removes bad fits from `ffuncs` and saves them in `bad_fits`. """ # Successful fits return None, which pandas treats as NaN. return_exception = kwargs.pop("return_exception", True) - # Filter out parallelization parameters from kwargs before passing to make_fit() - # These are specific to make_1dfits() and should not be passed to individual fits - fit_kwargs = { - k: v for k, v in kwargs.items() if k not in ["n_jobs", "verbose", "backend"] - } - - # Check if parallel execution is requested and possible - if n_jobs != 1 and len(self.ffuncs) > 1: - if not JOBLIB_AVAILABLE: - warnings.warn( - f"joblib not installed. Install with 'pip install joblib' " - f"for parallel processing of {len(self.ffuncs)} fits. " - f"Falling back to sequential execution.", - UserWarning, - ) - n_jobs = 1 - else: - # Parallel execution - return fitted objects to preserve TrendFit architecture - def fit_single_from_data( - column_name, x_data, y_data, ffunc_class, ffunc_kwargs - ): - """Create and fit FitFunction, return both result and fitted object.""" - # Create new FitFunction instance in worker process - ffunc = ffunc_class(x_data, y_data, **ffunc_kwargs) - fit_result = ffunc.make_fit( - return_exception=return_exception, **fit_kwargs - ) - # Return tuple: (fit_result, fitted_object) - return (fit_result, ffunc) - - # Prepare minimal data for each fit - fit_tasks = [] - for col_name, ffunc in self.ffuncs.items(): - x_data = ffunc.observations.raw.x - y_data = ffunc.observations.raw.y - ffunc_class = type(ffunc) - # Extract constructor kwargs from ffunc (constraints, etc.) - ffunc_kwargs = { - "xmin": getattr(ffunc, "xmin", None), - "xmax": getattr(ffunc, "xmax", None), - "ymin": getattr(ffunc, "ymin", None), - "ymax": getattr(ffunc, "ymax", None), - "xoutside": getattr(ffunc, "xoutside", None), - "youtside": getattr(ffunc, "youtside", None), - } - # Remove None values - ffunc_kwargs = { - k: v for k, v in ffunc_kwargs.items() if v is not None - } - - fit_tasks.append( - (col_name, x_data, y_data, ffunc_class, ffunc_kwargs) - ) - - # Run fits in parallel and get both results and fitted objects - parallel_output = Parallel( - n_jobs=n_jobs, verbose=verbose, backend=backend - )( - delayed(fit_single_from_data)( - col_name, x_data, y_data, ffunc_class, ffunc_kwargs - ) - for col_name, x_data, y_data, ffunc_class, ffunc_kwargs in fit_tasks - ) - - # Separate results and fitted objects, update self.ffuncs with fitted objects - fit_results = [] - for idx, (result, fitted_ffunc) in enumerate(parallel_output): - fit_results.append(result) - # CRITICAL: Replace original with fitted object to preserve TrendFit architecture - col_name = self.ffuncs.index[idx] - self.ffuncs[col_name] = fitted_ffunc - - # Convert to Series for bad fit handling - fit_success = pd.Series(fit_results, index=self.ffuncs.index) - - if n_jobs == 1: - # Original sequential implementation (unchanged) - fit_success = self.ffuncs.apply( - lambda x: x.make_fit(return_exception=return_exception, **fit_kwargs) - ) + fit_success = self.ffuncs.apply( + lambda x: x.make_fit(return_exception=return_exception, **kwargs) + ) - # Handle failed fits (original code, unchanged) + # Handle failed fits bad_idx = fit_success.dropna().index bad_fits = self.ffuncs.loc[bad_idx] self._bad_fits = bad_fits diff --git a/tests/fitfunctions/test_trend_fits_advanced.py b/tests/fitfunctions/test_trend_fits_advanced.py index 3e42b31c..1baf9708 100644 --- a/tests/fitfunctions/test_trend_fits_advanced.py +++ b/tests/fitfunctions/test_trend_fits_advanced.py @@ -1,14 +1,12 @@ -"""Test Phase 4 performance optimizations.""" +"""Test TrendFit advanced features.""" import time -import warnings import matplotlib import matplotlib.pyplot as plt import numpy as np import pandas as pd import pytest -from unittest.mock import patch from solarwindpy.fitfunctions import Gaussian, Line from solarwindpy.fitfunctions.trend_fits import TrendFit @@ -16,219 +14,6 @@ matplotlib.use("Agg") # Non-interactive backend for testing -class TestTrendFitParallelization: - """Test TrendFit parallel execution.""" - - def setup_method(self): - """Create test data for reproducible tests.""" - np.random.seed(42) - x = np.linspace(0, 10, 50) - self.data = pd.DataFrame( - { - f"col_{i}": 5 * np.exp(-((x - 5) ** 2) / 2) - + np.random.normal(0, 0.1, 50) - for i in range(10) - }, - index=x, - ) - - def test_backward_compatibility(self): - """Verify default behavior unchanged.""" - tf = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf.make_ffunc1ds() - - # Should work without n_jobs parameter (default behavior) - tf.make_1dfits() - assert len(tf.ffuncs) > 0 - assert hasattr(tf, "_bad_fits") - - def test_parallel_sequential_equivalence(self): - """Verify parallel gives same results as sequential.""" - # Sequential execution - tf_seq = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf_seq.make_ffunc1ds() - tf_seq.make_1dfits(n_jobs=1) - - # Parallel execution - tf_par = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf_par.make_ffunc1ds() - tf_par.make_1dfits(n_jobs=2) - - # Should have same number of successful fits - assert len(tf_seq.ffuncs) == len(tf_par.ffuncs) - - # Compare all fit parameters - for key in tf_seq.ffuncs.index: - assert ( - key in tf_par.ffuncs.index - ), f"Fit {key} missing from parallel results" - - seq_popt = tf_seq.ffuncs[key].popt - par_popt = tf_par.ffuncs[key].popt - - # Parameters should match within numerical precision - for param in seq_popt: - np.testing.assert_allclose( - seq_popt[param], - par_popt[param], - rtol=1e-10, - atol=1e-10, - err_msg=f"Parameter {param} differs between sequential and parallel", - ) - - def test_parallel_execution_correctness(self): - """Verify parallel execution works correctly, acknowledging Python GIL limitations.""" - # Check if joblib is available - if not, test falls back gracefully - try: - import joblib # noqa: F401 - - joblib_available = True - except ImportError: - joblib_available = False - - # Create test dataset - focus on correctness rather than performance - x = np.linspace(0, 10, 100) - data = pd.DataFrame( - { - f"col_{i}": 5 * np.exp(-((x - 5) ** 2) / 2) - + np.random.normal(0, 0.1, 100) - for i in range(20) # Reasonable number of fits - }, - index=x, - ) - - # Time sequential execution - tf_seq = TrendFit(data, Gaussian, ffunc1d=Gaussian) - tf_seq.make_ffunc1ds() - start = time.perf_counter() - tf_seq.make_1dfits(n_jobs=1) - seq_time = time.perf_counter() - start - - # Time parallel execution with threading - tf_par = TrendFit(data, Gaussian, ffunc1d=Gaussian) - tf_par.make_ffunc1ds() - start = time.perf_counter() - tf_par.make_1dfits(n_jobs=4, backend="threading") - par_time = time.perf_counter() - start - - speedup = seq_time / par_time if par_time > 0 else float("inf") - - print( - f"Sequential time: {seq_time:.3f}s, fits: {len(tf_seq.ffuncs)}" # noqa: E231 - ) - print( - f"Parallel time: {par_time:.3f}s, fits: {len(tf_par.ffuncs)}" # noqa: E231 - ) - print( - f"Speedup achieved: {speedup:.2f}x (joblib available: {joblib_available})" # noqa: E231 - ) - - if joblib_available: - # Main goal: verify parallel execution works and produces correct results - # Note: Due to Python GIL and serialization overhead, speedup may be minimal - # or even negative for small/fast workloads. This is expected behavior. - assert ( - speedup > 0.05 - ), f"Parallel execution extremely slow, got {speedup:.2f}x" # noqa: E231 - print( - "NOTE: Python GIL and serialization overhead may limit speedup for small workloads" - ) - else: - # Without joblib, both should be sequential (speedup ~1.0) - # Widen tolerance to 1.5 for timing variability across platforms - assert ( - 0.5 <= speedup <= 1.5 - ), f"Expected ~1.0x speedup without joblib, got {speedup:.2f}x" # noqa: E231 - - # Most important: verify both produce the same number of successful fits - assert len(tf_seq.ffuncs) == len( - tf_par.ffuncs - ), "Parallel and sequential should have same success rate" - - # Verify results are equivalent (this is the key correctness test) - for key in tf_seq.ffuncs.index: - if key in tf_par.ffuncs.index: # Both succeeded - seq_popt = tf_seq.ffuncs[key].popt - par_popt = tf_par.ffuncs[key].popt - for param in seq_popt: - np.testing.assert_allclose( - seq_popt[param], - par_popt[param], - rtol=1e-10, - atol=1e-10, - err_msg=f"Parameter {param} differs between sequential and parallel", - ) - - def test_joblib_not_installed_fallback(self): - """Test graceful fallback when joblib unavailable.""" - # Mock joblib as unavailable - with patch.dict("sys.modules", {"joblib": None}): - # Force reload to simulate joblib not being installed - import solarwindpy.fitfunctions.trend_fits as tf_module - - # Temporarily mock JOBLIB_AVAILABLE - original_available = tf_module.JOBLIB_AVAILABLE - tf_module.JOBLIB_AVAILABLE = False - - try: - tf = tf_module.TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf.make_ffunc1ds() - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - tf.make_1dfits(n_jobs=-1) # Request parallel - - # Should warn about joblib not being available - assert len(w) == 1 - assert "joblib not installed" in str(w[0].message) - assert "parallel processing" in str(w[0].message) - - # Should still complete successfully with sequential execution - assert len(tf.ffuncs) > 0 - finally: - # Restore original state - tf_module.JOBLIB_AVAILABLE = original_available - - def test_n_jobs_parameter_validation(self): - """Test different n_jobs parameter values.""" - tf = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf.make_ffunc1ds() - - # Test various n_jobs values - for n_jobs in [1, 2, -1]: - tf_test = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf_test.make_ffunc1ds() - tf_test.make_1dfits(n_jobs=n_jobs) - assert len(tf_test.ffuncs) > 0, f"n_jobs={n_jobs} failed" - - def test_verbose_parameter(self): - """Test verbose parameter doesn't break execution.""" - tf = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf.make_ffunc1ds() - - # Should work with verbose output (though we can't easily test the output) - tf.make_1dfits(n_jobs=2, verbose=0) - assert len(tf.ffuncs) > 0 - - def test_backend_parameter(self): - """Test different joblib backends.""" - tf = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf.make_ffunc1ds() - - # Test different backends (may not all be available in all environments) - for backend in ["loky", "threading"]: - tf_test = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf_test.make_ffunc1ds() - try: - tf_test.make_1dfits(n_jobs=2, backend=backend) - assert len(tf_test.ffuncs) > 0, f"Backend {backend} failed" - except ValueError: - # Some backends may not be available in all environments - pytest.skip( - f"Backend {backend} not available in this environment" # noqa: E713 - ) - - class TestResidualsEnhancement: """Test residuals use_all parameter.""" @@ -400,7 +185,7 @@ def test_complete_workflow(self): * np.exp(-((x - (10 + i * 0.2)) ** 2) / (2 * (2 + i * 0.1) ** 2)) + np.random.normal(0, 0.05, 200) ) - for i in range(25) # 25 measurements for good parallelization test + for i in range(25) }, index=x, ) @@ -409,9 +194,8 @@ def test_complete_workflow(self): tf = TrendFit(data, Gaussian, ffunc1d=Gaussian) tf.make_ffunc1ds() - # Fit with parallelization start_time = time.perf_counter() - tf.make_1dfits(n_jobs=-1, verbose=0) + tf.make_1dfits() execution_time = time.perf_counter() - start_time # Verify results From e05d0f029ce0e2f4c954e591c29d2f9ff11e69b4 Mon Sep 17 00:00:00 2001 From: blalterman Date: Tue, 17 Feb 2026 19:55:44 +0000 Subject: [PATCH 2/2] chore: auto-sync lockfiles from pyproject.toml - Updated requirements.txt (production dependencies) - Updated requirements-dev.lock (development dependencies) - Updated docs/requirements.txt (documentation dependencies) - Updated conda environment: solarwindpy.yml - Auto-generated via pip-compile from pyproject.toml --- docs/requirements.txt | 2 +- requirements-dev.lock | 2 +- solarwindpy.yml | 65 ------------------------------------------- 3 files changed, 2 insertions(+), 67 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 3f476075..7fd96240 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --allow-unsafe --extra=docs --output-file=docs/requirements.txt pyproject.toml diff --git a/requirements-dev.lock b/requirements-dev.lock index 4a7e9d05..3a4ff15c 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --allow-unsafe --extra=dev --output-file=requirements-dev.lock pyproject.toml diff --git a/solarwindpy.yml b/solarwindpy.yml index 16c50efe..1dd1dadb 100644 --- a/solarwindpy.yml +++ b/solarwindpy.yml @@ -22,94 +22,29 @@ name: solarwindpy channels: - conda-forge dependencies: -- alabaster - astropy - astropy-iers-data -- babel -- black -- python-blosc2 - bottleneck -- certifi -- cfgv -- charset-normalizer -- click - contourpy -- coverage[toml] - cycler -- distlib -- doc8 - docstring-inheritance -- docutils -- filelock -- flake8 -- flake8-docstrings - fonttools - h5py -- identify -- idna -- imagesize -- iniconfig -- jinja2 - kiwisolver -- latexcodec - llvmlite -- markupsafe - matplotlib -- mccabe -- msgpack-python -- mypy_extensions -- ndindex -- nodeenv - numba - numexpr - numpy -- numpydoc - packaging - pandas -- pathspec - pillow -- platformdirs -- pluggy -- pre-commit -- psutil -- py-cpuinfo -- pybtex -- pybtex-docutils -- pycodestyle -- pydocstyle -- pyenchant - pyerfa -- pyflakes -- pygments - pyparsing -- pytest -- pytest-cov - python-dateutil -- pytokens - pytz - pyyaml -- requests -- restructuredtext_lint -- roman-numerals -- roman-numerals-py - scipy - six -- snowballstemmer -- sphinx -- sphinx-rtd-theme -- sphinxcontrib-applehelp -- sphinxcontrib-bibtex -- sphinxcontrib-devhelp -- sphinxcontrib-htmlhelp -- sphinxcontrib-jquery -- sphinxcontrib-jsmath -- sphinxcontrib-qthelp -- sphinxcontrib-serializinghtml -- sphinxcontrib-spelling -- stevedore -- pytables - tabulate -- typing-extensions - tzdata -- urllib3 -- virtualenv