diff --git a/.github/workflows/check-pr-label.yml b/.github/workflows/check-pr-label.yml
new file mode 100644
index 0000000..b7ea439
--- /dev/null
+++ b/.github/workflows/check-pr-label.yml
@@ -0,0 +1,21 @@
+name: Check Label on PR
+on:
+  pull_request:
+    types: [opened, synchronize, labeled, unlabeled]
+
+jobs:
+  check_pr_label:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout PR
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0 # Fetch all history for all branches and tags
+
+      - name: Check if the PR has the 'validation' or 'no validation' label
+        id: check_pr_label
+        if: |
+          ! contains( github.event.pull_request.labels.*.name, 'validation') && ! contains( github.event.pull_request.labels.*.name, 'no validation')
+        run: |
+          echo "Neither the 'validation' nor the 'no validation' label is present."
+          exit 1 # Exit with a failure
\ No newline at end of file
diff --git a/.github/workflows/check-version.yml b/.github/workflows/check-version.yml
new file mode 100644
index 0000000..88cce6f
--- /dev/null
+++ b/.github/workflows/check-version.yml
@@ -0,0 +1,31 @@
+name: Check version update
+on:
+  pull_request:
+    branches: [ "master" ]
+permissions:
+  contents: read
+jobs:
+  check-version:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout PR
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0 # Fetch all history for all branches and tags
+      - name: Check version update
+        run: |
+          PR_NUMBER=${{ github.event.pull_request.number }}
+          FILE_CHANGED=$(git diff --name-only ${{ github.event.pull_request.base.sha }} HEAD | grep 'pyproject.toml' || true)
+          if [ -z "$FILE_CHANGED" ]; then
+            echo "pyproject.toml was not changed in this PR. Please update the version."
+            exit 1
+          else
+            echo "pyproject.toml was changed in this PR."
+          fi
+          VERSION_CHANGED=$(git diff ${{ github.event.pull_request.base.sha }} HEAD | grep 'version =' || true)
+          if [ -z "$VERSION_CHANGED" ]; then
+            echo "Version in pyproject.toml was not updated. Please update the version."
+            exit 1
+          else
+            echo "Version in pyproject.toml was updated."
+          fi
\ No newline at end of file
diff --git a/.github/workflows/lint-code.yml b/.github/workflows/lint-code.yml
new file mode 100644
index 0000000..c082a48
--- /dev/null
+++ b/.github/workflows/lint-code.yml
@@ -0,0 +1,58 @@
+name: Lint code
+on: [push, pull_request]
+
+jobs:
+  # Use ruff to check for code style violations
+  ruff-check:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v6
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.14"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install ruff
+      - name: ruff --> Check for style violations
+        # Configured in pyproject.toml
+        run: ruff check .
+
+  # Use ruff to check code formatting
+  ruff-format:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v6
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.14"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install ruff
+      - name: ruff --> Check code formatting
+        run: ruff format --check .
+
+  # Use pip-check-reqs/pip-missing-reqs to check for missing dependencies
+  requirements-check:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v6
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.14"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install pip-check-reqs
+
+      - name: Run pip-check-reqs/pip-missing-reqs
+        run: |
+          pip-missing-reqs .
diff --git a/.github/workflows/python-app.yml b/.github/workflows/test-code.yml
similarity index 58%
rename from .github/workflows/python-app.yml
rename to .github/workflows/test-code.yml
index 2ed96a5..8bfcb4f 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/test-code.yml
@@ -18,22 +18,22 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v6
     - name: Set up Python 3.14
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v6
       with:
         python-version: "3.14"
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install flake8 pytest
+        pip install pytest pytest-cov
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-    - name: Lint with flake8
-      run: |
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Install dataflow_transfer
+      run: pip install -e .
     - name: Test with pytest
       run: |
-        pytest
+        pytest --cov --cov-branch --cov-report=xml
+    - name: Upload coverage reports to Codecov
+      uses: codecov/codecov-action@v5
+      with:
+        token: ${{ secrets.CODECOV_TOKEN }}
\ No newline at end of file
diff --git a/dataflow_transfer/cli.py b/dataflow_transfer/cli.py
index 57c90a6..b5c7688 100644
--- a/dataflow_transfer/cli.py
+++ b/dataflow_transfer/cli.py
@@ -1,17 +1,18 @@
-import click
-import os
 import logging
+import os
+
+import click
 import yaml
-from dataflow_transfer.dataflow_transfer import transfer_runs
 from dataflow_transfer import log
+from dataflow_transfer.dataflow_transfer import transfer_runs
 from dataflow_transfer.run_classes.registry import RUN_CLASS_REGISTRY
 
 logger = logging.getLogger(__name__)
 
 
 def load_config(config_file_path):
-    with open(config_file_path, "r") as file:
+    with open(config_file_path) as file:
         config = yaml.safe_load(file)
     return config
 
 
diff --git a/dataflow_transfer/dataflow_transfer.py b/dataflow_transfer/dataflow_transfer.py
index e696ff8..3b34bb7 100644
--- a/dataflow_transfer/dataflow_transfer.py
+++ b/dataflow_transfer/dataflow_transfer.py
@@ -2,7 +2,7 @@
 import time
 
 from dataflow_transfer.run_classes.registry import RUN_CLASS_REGISTRY
-from dataflow_transfer.utils.filesystem import get_run_dir, find_runs
+from dataflow_transfer.utils.filesystem import find_runs, get_run_dir
 
 logger = logging.getLogger(__name__)
 
@@ -30,7 +30,7 @@ def process_run(run_dir, sequencer, config):
     ## Sequencing ongoing. Start background transfer if not already running.
     if run.sequencing_ongoing:
         run.update_statusdb(status="sequencing_started")
-        run.initiate_background_transfer()
+        run.start_transfer(final=False)
         return
 
     ## Sequencing finished but transfer not complete. Start final transfer.
@@ -41,7 +41,7 @@ def process_run(run_dir, sequencer, config):
             "Will attempt final transfer again."
         )
         run.update_statusdb(status="sequencing_finished")
-        run.do_final_transfer()
+        run.start_transfer(final=True)
         return
 
     ## Final transfer completed successfully. Update statusdb.
diff --git a/dataflow_transfer/run_classes/__init__.py b/dataflow_transfer/run_classes/__init__.py
index c58da30..705c5c7 100644
--- a/dataflow_transfer/run_classes/__init__.py
+++ b/dataflow_transfer/run_classes/__init__.py
@@ -1,12 +1,11 @@
 # This adds the run classes to the registry. Do not remove.
-from .registry import RUN_CLASS_REGISTRY
-
-
+from dataflow_transfer.run_classes.element_runs import AVITIRun  # noqa: F401, I001
 from dataflow_transfer.run_classes.illumina_runs import (
-    NovaSeqXPlusRun,
-    NextSeqRun,
-    MiSeqRun,
+    MiSeqRun,  # noqa: F401
+    NextSeqRun,  # noqa: F401
+    NovaSeqXPlusRun,  # noqa: F401
 )
-from dataflow_transfer.run_classes.ont_runs import PromethIONRun, MinIONRun
-from dataflow_transfer.run_classes.element_runs import AVITIRun
+from dataflow_transfer.run_classes.ont_runs import MinIONRun, PromethIONRun  # noqa: F401
+
+from .registry import RUN_CLASS_REGISTRY  # noqa: F401
 
diff --git a/dataflow_transfer/run_classes/element_runs.py b/dataflow_transfer/run_classes/element_runs.py
index f00fac7..97f448e 100644
--- a/dataflow_transfer/run_classes/element_runs.py
+++ b/dataflow_transfer/run_classes/element_runs.py
@@ -1,4 +1,5 @@
 from dataflow_transfer.run_classes.generic_runs import Run
+
 from .registry import register_run_class
 
 
diff --git a/dataflow_transfer/run_classes/generic_runs.py b/dataflow_transfer/run_classes/generic_runs.py
index 5d7959d..d76d0b4 100644
--- a/dataflow_transfer/run_classes/generic_runs.py
+++ b/dataflow_transfer/run_classes/generic_runs.py
@@ -1,9 +1,10 @@
-import os
 import logging
+import os
 import re
 from datetime import datetime
-from dataflow_transfer.utils.statusdb import StatusdbSession
+
 import dataflow_transfer.utils.filesystem as fs
+from dataflow_transfer.utils.statusdb import StatusdbSession
 
 logger = logging.getLogger(__name__)
 
@@ -65,53 +66,33 @@ def generate_rsync_command(self, is_final_sync=False):
             command_str += f"; echo $? > {self.final_rsync_exitcode_file}"
         return command_str
 
-    def initiate_background_transfer(self):
+    def start_transfer(self, final=False):
         """Start background rsync transfer to storage."""
-        background_transfer_command = self.generate_rsync_command(is_final_sync=False)
+        transfer_command = self.generate_rsync_command(is_final_sync=final)
         if fs.rsync_is_running(src=self.run_dir):
             logger.info(
                 f"Rsync is already running for {self.run_dir}. Skipping background transfer initiation."
             )
             return
         try:
-            fs.submit_background_process(background_transfer_command)
+            fs.submit_background_process(transfer_command)
             logger.info(
-                f"{self.run_id}: Started background rsync to {self.miarka_destination}"
-                + f" with the following command: '{background_transfer_command}'"
+                f"{self.run_id}: Started rsync to {self.miarka_destination}"
+                + f" with the following command: '{transfer_command}'"
             )
         except Exception as e:
-            logger.error(f"Failed to start background transfer for {self.run_id}: {e}")
+            logger.error(f"Failed to start rsync for {self.run_id}: {e}")
             raise e
         rsync_info = {
-            "command": background_transfer_command,
+            "command": transfer_command,
             "destination_path": self.miarka_destination,
         }
-        self.update_statusdb(status="transfer_started", additional_info=rsync_info)
-
-    def do_final_transfer(self):
-        """Start final rsync transfer to storage."""
-        final_transfer_command = self.generate_rsync_command(is_final_sync=True)
-        if fs.rsync_is_running(src=self.run_dir):
-            logger.info(
-                f"Rsync is already running for {self.run_dir}. Skipping final transfer initiation."
-            )
-            return
-        try:
-            fs.submit_background_process(final_transfer_command)
-            logger.info(
-                f"{self.run_id}: Started FINAL rsync to {self.miarka_destination}"
-                + f" with the following command: '{final_transfer_command}'"
+        if final:
+            self.update_statusdb(
+                status="final_transfer_started", additional_info=rsync_info
             )
-        except Exception as e:
-            logger.error(f"Failed to start final transfer for {self.run_id}: {e}")
-            raise e
-        rsync_info = {
-            "command": final_transfer_command,
-            "destination_path": self.miarka_destination,
-        }
-        self.update_statusdb(
-            status="final_transfer_started", additional_info=rsync_info
-        )
+        else:
+            self.update_statusdb(status="transfer_started", additional_info=rsync_info)
 
     @property
     def final_sync_successful(self):
diff --git a/dataflow_transfer/run_classes/illumina_runs.py b/dataflow_transfer/run_classes/illumina_runs.py
index b5004a7..b74bb4c 100644
--- a/dataflow_transfer/run_classes/illumina_runs.py
+++ b/dataflow_transfer/run_classes/illumina_runs.py
@@ -1,4 +1,5 @@
 from dataflow_transfer.run_classes.generic_runs import Run
+
 from .registry import register_run_class
 
 
diff --git a/dataflow_transfer/run_classes/ont_runs.py b/dataflow_transfer/run_classes/ont_runs.py
index 1bd730b..dfd11d7 100644
--- a/dataflow_transfer/run_classes/ont_runs.py
+++ b/dataflow_transfer/run_classes/ont_runs.py
@@ -1,4 +1,5 @@
 from dataflow_transfer.run_classes.generic_runs import Run
+
 from .registry import register_run_class
 
 
diff --git a/dataflow_transfer/tests/test_filesystem.py b/dataflow_transfer/tests/test_filesystem.py
index 7bee1e7..2812b82 100644
--- a/dataflow_transfer/tests/test_filesystem.py
+++ b/dataflow_transfer/tests/test_filesystem.py
@@ -1,18 +1,19 @@
 import json
 import os
 import tempfile
-import pytest
-from unittest.mock import patch
 from subprocess import CalledProcessError
+from unittest.mock import patch
+
+import pytest
 
 from dataflow_transfer.utils.filesystem import (
-    get_run_dir,
+    check_exit_status,
     find_runs,
+    get_run_dir,
+    locate_metadata,
+    parse_metadata_files,
     rsync_is_running,
     submit_background_process,
-    parse_metadata_files,
-    check_exit_status,
-    locate_metadata,
 )
 
 
diff --git a/dataflow_transfer/tests/test_run_classes.py b/dataflow_transfer/tests/test_run_classes.py
index 3764333..8f46a92 100644
--- a/dataflow_transfer/tests/test_run_classes.py
+++ b/dataflow_transfer/tests/test_run_classes.py
@@ -1,8 +1,36 @@
 import os
+
 import pytest
+from dataflow_transfer.run_classes import generic_runs, illumina_runs
 
-from dataflow_transfer.run_classes import illumina_runs, generic_runs
+# TODO: add tests for ONT and ELEMENT runs when those are implemented
+
+
+@pytest.fixture
+def novaseqxplus_testobj(tmp_path):
+    config = {
+        "log": {"file": "test.log"},
+        "transfer_details": {"user": "testuser", "host": "testhost"},
+        "statusdb": {
+            "username": "dbuser",
+            "password": "dbpass",
+            "url": "dburl",
+            "database": "dbname",
+        },
+        "sequencers": {
+            "NovaSeqXPlus": {
+                "miarka_destination": "/data/NovaSeqXPlus",
+                "metadata_for_statusdb": ["RunInfo.xml", "RunParameters.xml"],
+                "ignore_folders": ["nosync"],
+                "rsync_options": ["--chmod=Dg+s,g+rw"],
+            }
+        },
+    }
+    run_id = "20251010_LH00202_0284_B22CVHTLT1"
+    run_dir = tmp_path / run_id
+    run_dir.mkdir()
+    return illumina_runs.NovaSeqXPlusRun(str(run_dir), config)
 
 
 @pytest.fixture
@@ -57,7 +85,6 @@ def miseqseq_testobj(tmp_path):
     return illumina_runs.MiSeqRun(str(run_dir), config)
 
 
-# mock calls to dataflow_transfer.utils.statusdb.StatusdbSession to avoid actual DB connections
 @pytest.fixture(autouse=True)
 def mock_statusdbsession(monkeypatch):
     class MockStatusdbSession:
@@ -73,10 +100,10 @@ def update_db_doc(self, doc):
     monkeypatch.setattr(generic_runs, "StatusdbSession", MockStatusdbSession)
 
 
-# use parameterization for the test fixtures to test confirm_run_type
 @pytest.mark.parametrize(
     "run_fixture, expected_run_type",
     [
+        ("novaseqxplus_testobj", "NovaSeqXPlus"),
         ("nextseq_testobj", "NextSeq"),
         ("miseqseq_testobj", "MiSeq"),
     ],
@@ -94,6 +121,7 @@ def test_confirm_run_type(run_fixture, expected_run_type, request):
 @pytest.mark.parametrize(
     "run_fixture",
     [
+        "novaseqxplus_testobj",
         "nextseq_testobj",
         "miseqseq_testobj",
     ],
@@ -112,6 +140,8 @@ def test_sequencing_ongoing(run_fixture, request):
 @pytest.mark.parametrize(
     "run_fixture, final_sync",
     [
+        ("novaseqxplus_testobj", False),
+        ("novaseqxplus_testobj", True),
         ("nextseq_testobj", False),
         ("nextseq_testobj", True),
         ("miseqseq_testobj", False),
@@ -129,17 +159,62 @@ def test_generate_rsync_command(run_fixture, final_sync, request):
         assert f"; echo $? > {run_obj.final_rsync_exitcode_file}" in rsync_command
 
 
-def test_initiate_background_transfer():
-    pass  # Further tests can be implemented for initiate_background_transfer
+@pytest.mark.parametrize(
+    "run_fixture, rsync_running, final",
+    [
+        ("novaseqxplus_testobj", False, False),
+        ("novaseqxplus_testobj", True, False),
+        ("novaseqxplus_testobj", False, True),
+        ("novaseqxplus_testobj", True, True),
+        ("nextseq_testobj", False, False),
+        ("nextseq_testobj", True, False),
+        ("nextseq_testobj", False, True),
+        ("nextseq_testobj", True, True),
+        ("miseqseq_testobj", False, False),
+        ("miseqseq_testobj", True, False),
+        ("miseqseq_testobj", False, True),
+        ("miseqseq_testobj", True, True),
+    ],
+)
+def test_start_transfer(run_fixture, rsync_running, final, request, monkeypatch):
+    run_obj = request.getfixturevalue(run_fixture)
+
+    def mock_rsync_is_running(src):
+        return rsync_running
+
+    def mock_submit_background_process(command_str):
+        mock_submit_background_process.called = True
+        mock_submit_background_process.command_str = command_str
+
+    def mock_update_statusdb(status, additional_info=None):
+        mock_update_statusdb.called = True
+        mock_update_statusdb.status = status
+
+    monkeypatch.setattr(generic_runs.fs, "rsync_is_running", mock_rsync_is_running)
+    monkeypatch.setattr(
+        generic_runs.fs, "submit_background_process", mock_submit_background_process
+    )
+    monkeypatch.setattr(run_obj, "update_statusdb", mock_update_statusdb)
 
+    run_obj.start_transfer(final=final)
 
-def test_do_final_transfer():
-    pass  # Further tests can be implemented for do_final_transfer
+    if rsync_running:
+        assert not hasattr(mock_submit_background_process, "called")
+    else:
+        assert hasattr(mock_submit_background_process, "called")
+        assert "rsync" in mock_submit_background_process.command_str
+        assert hasattr(mock_update_statusdb, "called")
+        if final:
+            assert mock_update_statusdb.status == "final_transfer_started"
+        else:
+            assert mock_update_statusdb.status == "transfer_started"
 
 
 @pytest.mark.parametrize(
     "run_fixture, sync_successful",
     [
+        ("novaseqxplus_testobj", True),
+        ("novaseqxplus_testobj", False),
         ("nextseq_testobj", True),
         ("nextseq_testobj", False),
         ("miseqseq_testobj", True),
@@ -159,10 +234,13 @@ def test_final_sync_successful(run_fixture, sync_successful, request):
     assert run_obj.final_sync_successful == sync_successful
 
 
-# use fixtures to test Run.has_status for differernt illumina_runs objects
 @pytest.mark.parametrize(
     "run_fixture, status_to_check, expected_result",
     [
+        ("novaseqxplus_testobj", "sequencing_started", False),
+        ("novaseqxplus_testobj", "sequencing_started", True),
+        ("novaseqxplus_testobj", "sequencing_finished", False),
+        ("novaseqxplus_testobj", "sequencing_finished", True),
         ("nextseq_testobj", "sequencing_started", False),
         ("nextseq_testobj", "sequencing_started", True),
         ("nextseq_testobj", "sequencing_finished", False),
@@ -185,3 +263,62 @@ def get_events(self, run_id):
 
     run_obj.db = MockDB()
     assert run_obj.has_status(status_to_check) == expected_result
+
+
+@pytest.mark.parametrize(
+    "run_fixture, existing_statuses, status_to_update",
+    [
+        (
+            "nextseq_testobj",
+            [],
+            "sequencing_started",
+        ),
+        (
+            "nextseq_testobj",
+            [{"event_type": "sequencing_started"}],
+            "transfer_started",
+        ),
+        (
+            "miseqseq_testobj",
+            [],
+            "sequencing_started",
+        ),
+        (
+            "miseqseq_testobj",
+            [{"event_type": "sequencing_started"}],
+            "transfer_started",
+        ),
+    ],
+)
+def test_update_statusdb(
+    run_fixture,
+    existing_statuses,
+    status_to_update,
+    request,
+):
+    run_obj = request.getfixturevalue(run_fixture)
+
+    class MockDB:
+        def __init__(self):
+            self.updated_doc = None
+
+        def get_db_doc(self, ddoc, view, run_id):
+            return {"events": existing_statuses, "files": {}}
+
+        def update_db_doc(self, doc):
+            self.updated_doc = doc
+
+    import dataflow_transfer.utils.filesystem as fs
+
+    def mock_locate_metadata(metadata_list, run_dir):
+        return []
+
+    def mock_parse_metadata_files(files):
+        return {}
+
+    fs.locate_metadata = mock_locate_metadata
+    fs.parse_metadata_files = mock_parse_metadata_files
+    mock_db = MockDB()
+    run_obj.db = mock_db
+    run_obj.update_statusdb(status=status_to_update)
+    assert mock_db.updated_doc["events"][-1]["event_type"] == status_to_update
diff --git a/dataflow_transfer/utils/filesystem.py b/dataflow_transfer/utils/filesystem.py
index e38068f..a02aadb 100644
--- a/dataflow_transfer/utils/filesystem.py
+++ b/dataflow_transfer/utils/filesystem.py
@@ -1,9 +1,10 @@
 import json
 import logging
 import os
-import xmltodict
 import subprocess
 
+import xmltodict
+
 logger = logging.getLogger(__name__)
 
 
@@ -40,9 +41,7 @@ def rsync_is_running(src):
 
 def submit_background_process(command_str: str):
     """Submit a command string as a background process."""
-    background_process = subprocess.Popen(
-        command_str, stdout=subprocess.PIPE, shell=True
-    )
+    subprocess.Popen(command_str, stdout=subprocess.PIPE, shell=True)
 
 
 def parse_metadata_files(files):
@@ -52,10 +51,10 @@ def parse_metadata_files(files):
     for file_path in files:
         try:
             if file_path.endswith(".json"):
-                with open(file_path, "r") as f:
+                with open(file_path) as f:
                     metadata[os.path.basename(file_path)] = json.load(f)
             elif file_path.endswith(".xml"):
-                with open(file_path, "r") as f:
+                with open(file_path) as f:
                     xml_content = xmltodict.parse(
                         f.read(), attr_prefix="", cdata_key="text"
                     )
@@ -71,10 +70,10 @@ def parse_metadata_files(files):
 
 
 def check_exit_status(file_path):
-    """Check the exit status from a given file. 
+    """Check the exit status from a given file.
     Return True if exit code is 0, else False."""
     if os.path.exists(file_path):
-        with open(file_path, "r") as f:
+        with open(file_path) as f:
             exit_code = f.read().strip()
             if exit_code == "0":
                 return True
diff --git a/pyproject.toml b/pyproject.toml
index 01a528d..157624c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,26 @@
+[tool.ruff.lint]
+select = [
+    # Ruff default rules
+    # ------------------------------
+    "E4",  # pycodestyle Imports
+    "E7",  # pycodestyle Statements
+    "E9",  # pycodestyle Runtime
+    "F",  # Pyflakes
+
+    # Additional Comment
+    # ------------------------------------------------------
+    "I",  # isort Best-practice sorting of imports
+    "UP",  # pyupgrade Make sure syntax is up-to-date
+]
+ignore = [
+    "E402",  # Module level import not at top of file
+    "E722",  # Do not use bare 'except'
+    "E741",  # Ambiguous variable name
+]
+
 [project]
 name = "dataflow_transfer"
-version = "1.0.2"
+version = "1.0.3"
 description = "Script for transferring sequencing data from sequencers to storage"
 authors = [
     { name = "Sara Sjunnebo", email = "sara.sjunnebo@scilifelab.se" },
@@ -20,6 +40,7 @@ dependencies = [
 dev = [
     "ruff>=0.11.8",
     "pytest",
+    "pytest-cov",
 ]
 
 [project.scripts]
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..a615bde
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,4 @@
+-r requirements.txt
+ruff>=0.11.8
+pytest
+pytest-cov
\ No newline at end of file
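
The reordered imports in dataflow_transfer/run_classes/__init__.py above are kept (and marked # noqa: F401) purely for their side effect: importing each module runs its @register_run_class decorator, which is what fills RUN_CLASS_REGISTRY before cli.py and dataflow_transfer.py look run classes up. The snippet below is a minimal, self-contained sketch of that decorator-registry pattern; registry.py itself is not part of this diff, so the keying by class name shown here is an assumption for illustration only.

# Hypothetical sketch of the registry pattern used by run_classes; the real
# registry.py is not included in this diff, so its details are assumed here.
RUN_CLASS_REGISTRY: dict[str, type] = {}


def register_run_class(cls: type) -> type:
    """Class decorator: record the class in the registry and return it unchanged."""
    RUN_CLASS_REGISTRY[cls.__name__] = cls
    return cls


@register_run_class
class NovaSeqXPlusRun:  # stand-in for the real Run subclass
    pass


# Importing the module that defines a class is enough to register it, which is
# why run_classes/__init__.py must keep its otherwise "unused" imports.
print(RUN_CLASS_REGISTRY)  # {'NovaSeqXPlusRun': <class '__main__.NovaSeqXPlusRun'>}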