Skip to content

Commit 90dc552

Browse files
committed
add multi-fasta initial probe set generation support
1 parent f0a340a commit 90dc552

5 files changed

Lines changed: 204 additions & 37 deletions

File tree

README.md

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -25,24 +25,6 @@ bash setup/install.sh
2525

2626
## Usage
2727

28-
### Preparation
29-
30-
`pipeline.py` relies on pre-prepared BLASTn databases. To create the required `true_base`, `false_base`, and `contig_table`, you can use the following script:
31-
32-
```bash
33-
bash scripts/generator/prep_db.sh \
34-
-n {DATABASE_NAME} \
35-
-c {CONTIG_NAME} \
36-
-t {TMP_DIR} \
37-
[FASTA]
38-
```
39-
40-
#### Arguments:
41-
- `-n DATABASE_NAME`: Name of the output BLAST database (required).
42-
- `-c CONTIG_TABLE`: Output file to store contig names and their corresponding sequence headers (required).
43-
- `-t TMP_DIR`: Temporary directory for intermediate files (optional, defaults to `./.tmp`).
44-
- `FASTA`: List of input FASTA files (gzipped or uncompressed).
45-
4628
### Generation
4729

4830
PROBEst can be run using the following command:
@@ -59,7 +41,7 @@ python pipeline.py \
5941
**Blastn databases** and **contig table** are results of the ```prep_db.sh```
6042

6143
#### Key arguments:
62-
- `-i INPUT`: Input FASTA file for probe generation.
44+
- `-i INPUT`: Input FASTA file (or directory with fasta / fasta.gz files) for the initial probe set generation.
6345
- `-tb TRUE_BASE`: Input BLASTn database path for primer adjusting.
6446
- `-fb FALSE_BASE`: Input BLASTn database path for non-specific testing.
6547
- `-c CONTIG_TABLE`: .tsv table with BLAST database information.
@@ -81,6 +63,25 @@ python test_parameters.py \
8163
-p {JSON}
8264
```
8365

66+
67+
### Manual preparation
68+
69+
`pipeline.py` relies on pre-prepared BLASTn databases. To create the required `true_base`, `false_base`, and `contig_table`, you can use the following script:
70+
71+
```bash
72+
bash scripts/generator/prep_db.sh \
73+
-n {DATABASE_NAME} \
74+
-c {CONTIG_NAME} \
75+
-t {TMP_DIR} \
76+
[FASTA]
77+
```
78+
79+
#### Arguments:
80+
- `-n DATABASE_NAME`: Name of the output BLAST database (required).
81+
- `-c CONTIG_TABLE`: Output file to store contig names and their corresponding sequence headers (required).
82+
- `-t TMP_DIR`: Temporary directory for intermediate files (optional, defaults to `./.tmp`).
83+
- `FASTA`: List of input FASTA files (gzipped or uncompressed).
84+
8485
### Web Application
8586

8687
PROBEst includes a user-friendly web interface for probe generation. The web app provides:

pipeline.py

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import numpy as np
2626
import pandas as pd
2727
import re
28+
import gzip
2829

2930
# Add src directory to path to ensure local modules can be imported
3031
script_dir = os.path.dirname(os.path.abspath(__file__))
@@ -35,7 +36,7 @@
3536
# 0 Imports: absolute import ----
3637
from PROBESt.evolution import mutate_position
3738
from PROBESt.bash_wrappers import uniline_fasta, blastn_function, probe_check_function
38-
from PROBESt.misc import write_stats
39+
from PROBESt.misc import write_stats, collect_fasta_files, process_multiple_inputs
3940
from PROBESt.merge import merge
4041
from PROBESt.args import arguments_parse
4142
from PROBESt.modeling import run_modeling
@@ -82,25 +83,18 @@ def merge_iter(iter: int):
8283
script_path=args.script_path
8384
)
8485

86+
# Collect all FASTA files from inputs
87+
input_fasta_files = collect_fasta_files(args.input)
88+
print(f"Found {len(input_fasta_files)} FASTA file(s) to process")
89+
8590
# Create TMP
8691
os.makedirs(out_dir(0), exist_ok=True)
8792

88-
# Make uniline fasta
89-
uniline_fasta(args, out_dir(0))
90-
print("Input fasta parsed")
91-
92-
# Template generation
93-
# Check if initial_generator argument exists (for backward compatibility)
93+
# Process each input FASTA separately for initial set generation
9494
initial_generator = getattr(args, 'initial_generator', 'primer3')
95+
process_multiple_inputs(args, input_fasta_files, out_dir(0), initial_generator)
9596

96-
if initial_generator == "primer3":
97-
from PROBESt.primer3 import initial_set_generation
98-
initial_set_generation(args, out_dir(0))
99-
elif initial_generator == "oligominer":
100-
from PROBESt.oligominer import initial_set_generation
101-
initial_set_generation(args, out_dir(0))
102-
else:
103-
raise ValueError(f"Unknown initial generator: {initial_generator}")
97+
# Merge the final combined output
10498
merge_iter(0)
10599

106100
# Check if initial set generation produced any probes
@@ -376,5 +370,18 @@ def merge_iter(iter: int):
376370
final_output_fa = args.output + "/output.fa"
377371

378372
# 8. Modeling and visualization ----
373+
# Create combined input FASTA for modeling (from all input files)
combined_input_fa = os.path.join(args.output, ".tmp", "combined_input.fa")
os.makedirs(os.path.dirname(combined_input_fa), exist_ok=True)
with open(combined_input_fa, 'w') as outfile:
    for fasta_file in input_fasta_files:
        # Gzipped inputs are decompressed transparently in text mode
        opener = gzip.open if fasta_file.endswith('.gz') else open
        last_line = ""
        with opener(fasta_file, 'rt') as infile:
            # Stream line by line so a large input is never held in memory whole
            for last_line in infile:
                outfile.write(last_line)
        # Without a trailing newline the next file's ">" header would be
        # appended to the last sequence line of this file
        if last_line and not last_line.endswith("\n"):
            outfile.write("\n")
385+
379386
modeling_output = os.path.join(args.output, "modeling_results.tsv")
380-
run_modeling(args, args.input, final_output_fa, modeling_output)
387+
run_modeling(args, combined_input_fa, final_output_fa, modeling_output)

setup/test_generator.sh

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,10 +119,24 @@ test2(){
119119
--oligominer_path "$OLIGOMINER_DIR"
120120
}
121121

122+
# Test multifasta input
123+
test3 () {
    # Directory passed to pipeline.py as a multi-FASTA input (-i accepts a directory)
    local fasta_dir="data/test/general/fasta_base/true_base"
    local archive

    # Decompress gzipped FASTA files in place. The existence check skips the
    # literal, unmatched "*.gz" pattern, so a re-run after the archives were
    # already decompressed does not make gzip fail.
    for archive in "$fasta_dir"/*.gz; do
        if [ -e "$archive" ]; then
            gzip -d "$archive"
        fi
    done

    python pipeline.py \
        -i "$fasta_dir" \
        -o data/test/general/output_multifasta \
        -tb data/test/general/fasta_base/true_base \
        -fb data/test/general/fasta_base/false_base_1 \
        data/test/general/fasta_base/false_base_2 \
        -a FISH \
        --PRIMER_PICK_PRIMER 1 \
        --PRIMER_NUM_RETURN 1
}
122135

123136

124137
# Execute the preparation function
125138
clear_previous_test
126139
prepare_blast
127140
test1
128-
test2
141+
test2
142+
test3

src/PROBESt/args.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ def arguments_parse():
3232
# Main arguments
3333
parser.add_argument("-i", "--input",
3434
required=True,
35-
help="Input FASTA file for probe generation. Probes are generated for different contigs separately. Only gene-coding regions are recommended (.fna).")
35+
nargs="*",
36+
help="Input FASTA file(s) or directory(ies) for probe generation. Can be a single file/directory or multiple files/directories. If a directory is provided, all *.fa, *.fna, *.fasta files (and their .gz versions) will be processed. Probes are generated for different contigs separately. Only gene-coding regions are recommended (.fna).")
3637

3738
parser.add_argument("-tb", "--true_base",
3839
required=True,

src/PROBESt/misc.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import os
22
import re
33
import subprocess
4+
import glob
5+
from typing import List
46

57

68
def write_stats(stats: dict, output_dir: str):
@@ -83,3 +85,145 @@ def pairing(x):
8385

8486
# Return a list with the base name appended by "RIGHT" and "LEFT"
8587
return [clear_primer + "RIGHT", clear_primer + "LEFT"]
88+
89+
90+
def collect_fasta_files(inputs: List[str]) -> List[str]:
    """
    Collect all FASTA files from input paths (files or directories).

    Args:
        inputs: List of input paths. A single path (not wrapped in a list)
            is also accepted and treated as a one-element list. Each path
            can be:
            - A FASTA file (*.fa, *.fna, *.fasta or their .gz versions)
            - A directory containing FASTA files (searched non-recursively)

    Returns:
        Sorted list of paths to all FASTA files found, without duplicates.

    Raises:
        ValueError: If an input path doesn't exist, an input file has an
            unrecognized extension, a directory contains no FASTA files,
            or no FASTA files are found at all.
    """
    # A tuple so it can be handed directly to str.endswith
    fasta_extensions = ('.fa', '.fna', '.fasta', '.fa.gz', '.fna.gz', '.fasta.gz')

    # A bare string would otherwise be iterated character by character
    if isinstance(inputs, (str, os.PathLike)):
        inputs = [inputs]

    fasta_files = []
    seen = set()

    for input_path in inputs:
        input_path = os.fspath(input_path)
        if not os.path.exists(input_path):
            raise ValueError(f"Input path does not exist: {input_path}")

        if os.path.isfile(input_path):
            # Check if it's a FASTA file
            if not input_path.endswith(fasta_extensions):
                raise ValueError(f"Input file is not a recognized FASTA file: {input_path}")
            candidates = [input_path]
        elif os.path.isdir(input_path):
            # Escape the directory so glob metacharacters in its name
            # (e.g. "[" or "*") are matched literally
            escaped_dir = glob.escape(input_path)
            candidates = []
            for ext in fasta_extensions:
                pattern = os.path.join(escaped_dir, f"*{ext}")
                # Keep regular files only: a sub-directory named "x.fa"
                # also matches the pattern but is not an input
                candidates.extend(p for p in glob.glob(pattern) if os.path.isfile(p))

            if not candidates:
                raise ValueError(f"No FASTA files found in directory: {input_path}")
        else:
            raise ValueError(f"Input path is neither a file nor a directory: {input_path}")

        for candidate in candidates:
            # The same file can be reached twice (listed explicitly and via
            # its directory); processing it twice would duplicate its probes
            key = os.path.normcase(os.path.abspath(candidate))
            if key not in seen:
                seen.add(key)
                fasta_files.append(candidate)

    if not fasta_files:
        raise ValueError("No FASTA files found in any of the provided inputs")

    return sorted(fasta_files)
136+
137+
138+
def _append_fasta(source_path, outfile):
    """Stream source_path into outfile, ending the copied text with a newline."""
    last_line = ""
    with open(source_path, 'r') as infile:
        # Line-by-line copy: the file is never held in memory as a whole
        for last_line in infile:
            outfile.write(last_line)
    # Without a trailing newline the next file's ">" header would be glued
    # onto the last sequence line of this one
    if last_line and not last_line.endswith("\n"):
        outfile.write("\n")


def process_multiple_inputs(
    args: 'Namespace',
    input_fasta_files: List[str],
    output_dir: str,
    initial_generator: str = 'primer3'
) -> str:
    """
    Process multiple input FASTA files separately for initial set generation and merge their outputs.

    This function processes each input FASTA file independently:
    1. Creates a sub-directory ``input_<idx>/`` of ``output_dir`` for each input
    2. Runs uniline_fasta and initial_set_generation for each
    3. Merges each input's output.fa into merged.fa
    4. Concatenates the per-input results into ``output_dir``/output.fa

    Args:
        args: Arguments object containing all pipeline parameters. It is not
            modified; each input gets a shallow copy whose ``input``
            attribute is set to that input's path.
        input_fasta_files: List of paths to FASTA files to process
        output_dir: Output directory path (e.g., .tmp/0/)
        initial_generator: Initial generator to use ('primer3' or 'oligominer')

    Returns:
        Path to the final combined output.fa file

    Raises:
        ValueError: If initial_generator is unknown (raised before any
            directory is created or any input is processed)
        ImportError: If required modules cannot be imported
    """
    # Fail fast: reject a bad generator name before touching the file system
    # or running any per-input work
    if initial_generator not in ("primer3", "oligominer"):
        raise ValueError(f"Unknown initial generator: {initial_generator}")

    import copy

    # Import here to avoid circular dependencies
    from PROBESt.bash_wrappers import uniline_fasta
    from PROBESt.merge import merge

    # Resolve the generator once instead of on every loop iteration
    if initial_generator == "primer3":
        from PROBESt.primer3 import initial_set_generation
    else:
        from PROBESt.oligominer import initial_set_generation

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Process each input FASTA separately for initial set generation
    temp_output_dirs = []

    for idx, fasta_file in enumerate(input_fasta_files):
        print(f"\nProcessing input {idx + 1}/{len(input_fasta_files)}: {fasta_file}")

        # Create temporary directory for this input
        temp_dir = os.path.join(output_dir, f"input_{idx}/")
        os.makedirs(temp_dir, exist_ok=True)
        temp_output_dirs.append(temp_dir)

        # Shallow copy so the caller's args keeps its original input value
        temp_args = copy.copy(args)
        temp_args.input = fasta_file

        # Make uniline fasta for this input
        uniline_fasta(temp_args, temp_dir)
        print(f"Input fasta parsed: {fasta_file}")

        # Template generation for this input
        initial_set_generation(temp_args, temp_dir)

        # Merge this input's output
        merge(algo=temp_args.algorithm,
              input=os.path.join(temp_dir, "output.fa"),
              output=os.path.join(temp_dir, "merged.fa"),
              tmp=os.path.join(temp_dir, "fasta_table.tsv"),
              NNN=10,
              script_path=temp_args.script_path)

    # Concatenate the per-input results into output_dir/output.fa
    print("\nMerging all initial set outputs...")
    final_output_fa = os.path.join(output_dir, "output.fa")
    with open(final_output_fa, 'w') as outfile:
        for temp_dir in temp_output_dirs:
            # Prefer merged.fa; fall back to output.fa when merged.fa is
            # missing or empty
            for candidate in ("merged.fa", "output.fa"):
                source_fa = os.path.join(temp_dir, candidate)
                if os.path.exists(source_fa) and os.path.getsize(source_fa) > 0:
                    _append_fasta(source_fa, outfile)
                    break

    return final_output_fa

0 commit comments

Comments
 (0)