diff --git a/src/decoupler/op/_hallmark.py b/src/decoupler/op/_hallmark.py
index e88f6d6..b71541d 100644
--- a/src/decoupler/op/_hallmark.py
+++ b/src/decoupler/op/_hallmark.py
@@ -1,3 +1,5 @@
+from concurrent.futures import Future, ThreadPoolExecutor
+
 import pandas as pd
 
 from decoupler._docs import docs
@@ -11,7 +13,8 @@ def hallmark(
     organism: str = "human",
     license: str = "academic",
     verbose: bool = False,
-) -> pd.DataFrame:
+    as_future: bool = False,
+) -> pd.DataFrame | Future:
     """
     Hallmark gene sets :cite:p:`msigdb`.
 
@@ -23,10 +26,13 @@ def hallmark(
     %(organism)s
     %(license)s
     %(verbose)s
+    as_future : bool
+        If True, returns a `Future` to allow asynchronous execution.
 
     Returns
     -------
-    Dataframe in long format containing the hallmark gene sets.
+    Dataframe in long format containing the hallmark gene sets
+    or a Future that resolves to it.
 
     Example
     -------
@@ -36,17 +42,34 @@ def hallmark(
 
         hm = dc.op.hallmark()
         hm
+
+        # Asynchronous
+        future = dc.op.hallmark(as_future=True)
+        hm = future.result()
     """
-    url = "https://static.omnipathdb.org/tables/msigdb-hallmark.tsv.gz"
-    hm = _download(url, verbose=verbose)
-    hm = _bytes_to_pandas(hm, sep="\t", compression="gzip")
-    hm = hm[["geneset", "genesymbol"]]
-    hm["geneset"] = hm["geneset"].str.replace("HALLMARK_", "")
-    hm["genesymbol"] = hm["genesymbol"].str.replace("COMPLEX:", "").str.split("_")
-    hm = hm.explode("genesymbol")
-    hm = _infer_dtypes(hm)
-    if organism != "human":
-        hm = translate(hm, columns=["genesymbol"], target_organism=organism, verbose=verbose)
-    hm = hm.rename(columns={"geneset": "source", "genesymbol": "target"})
-    hm = hm.drop_duplicates(["source", "target"]).reset_index(drop=True)
-    return hm
+
+    def _task():
+        url = "https://static.omnipathdb.org/tables/msigdb-hallmark.tsv.gz"
+        hm = _download(url, verbose=verbose)
+        hm = _bytes_to_pandas(hm, sep="\t", compression="gzip")
+        hm = hm[["geneset", "genesymbol"]]
+        hm["geneset"] = hm["geneset"].str.replace("HALLMARK_", "")
+        hm["genesymbol"] = hm["genesymbol"].str.replace("COMPLEX:", "").str.split("_")
+        hm = hm.explode("genesymbol")
+        hm = _infer_dtypes(hm)
+        if organism != "human":
+            hm = translate(hm, columns=["genesymbol"], target_organism=organism, verbose=verbose)
+        hm = hm.rename(columns={"geneset": "source", "genesymbol": "target"})
+        hm = hm.drop_duplicates(["source", "target"]).reset_index(drop=True)
+        return hm
+
+    if as_future:
+        # Avoid `with ThreadPoolExecutor(...)`: leaving the block calls
+        # `shutdown(wait=True)`, which blocks until `_task` is done and
+        # makes the call synchronous.
+        executor = ThreadPoolExecutor(max_workers=1)
+        future = executor.submit(_task)
+        # Return immediately; the submitted task still runs to completion.
+        executor.shutdown(wait=False)
+        return future
+    return _task()
diff --git a/tests/op/test_hallmark.py b/tests/op/test_hallmark.py
index 7ec2ed1..166ba2d 100644
--- a/tests/op/test_hallmark.py
+++ b/tests/op/test_hallmark.py
@@ -9,3 +9,12 @@ def test_hallmark():
     cols = {"source", "target"}
     assert cols.issubset(hm.columns)
     assert not hm.duplicated(["source", "target"]).any()
+
+
+def test_hallmark_as_future():
+    future = dc.op.hallmark(as_future=True)
+    hm = future.result()
+    assert isinstance(hm, pd.DataFrame)
+    cols = {"source", "target"}
+    assert cols.issubset(hm.columns)
+    assert not hm.duplicated(["source", "target"]).any()