From 750e55e664ef98113e25fe03f4b273aac6b335c1 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 14 Aug 2025 09:15:50 +0200 Subject: [PATCH 01/13] Fist edition of Aviti index fixer script --- aviti_index_fixer.py | 62 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 aviti_index_fixer.py diff --git a/aviti_index_fixer.py b/aviti_index_fixer.py new file mode 100644 index 0000000..b30ecb9 --- /dev/null +++ b/aviti_index_fixer.py @@ -0,0 +1,62 @@ +import click as cli +import pandas as pd + +NT_COMPLIMENT = { + 'A': 'T', + 'T': 'A', + 'C': 'G', + 'G': 'C' +} + +def load_manifest(path): + """Load the manifest from the given path.""" + with open(path, 'r') as file: + manifest_content = file.read() + header = manifest_content.split("[SAMPLES]")[0] + samples = manifest_content.split("[SAMPLES]")[1].strip().split("\n") + return header, samples + +def load_sample_dataframe(manifest_data): + """Extract the sample data from the manifest and convert it to a DataFrame.""" + header = manifest_data[0] + sample_rows = manifest_data[1:] + + sample_dicts = [] + for row in sample_rows: + row_dict = dict(zip(header.split(","), row.split(","))) + sample_dicts.append(row_dict) + + samples_info = pd.DataFrame.from_dict(sample_dicts) + return samples_info + +def reverse_complement_index(index): + """Return the reverse complement of a given index.""" + return ''.join(NT_COMPLIMENT[nuc] for nuc in reversed(index)) + + +@cli.command() +@cli.option('--manifest_path', required=True, help='Path to the sample manifest. e.g. ~/fc/AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv') +@cli.option('--project', required=False, help='Project ID, e.g. P10001. Only the indexes of samples with this specific project ID will be changed') +@cli.option('--swap', is_flag=True, help='Swaps index 1 and 2.') +@cli.option('--rc1', is_flag=True, help='Exchanges index 1 for its reverse compliment.') +@cli.option('--rc2', is_flag=True, help='Exchanges index 2 for its reverse compliment.') +# TODO: Add option to include additional samples + +def main(manifest_path, project, swap, rc1, rc2): + """Main function to fix the samplesheet indexes for AVITI runs.""" + # Read the samplesheet + manifest_header, manifest_data = load_manifest(manifest_path) + # Read the sample data into a data frame (look at Element_runs.py for an example) + samples_info = load_sample_dataframe(manifest_data) + # Process the indexes based on the options provided + if rc1: + samples_info['Index1'] = samples_info['Index1'].apply(reverse_complement_index) + if rc2: + samples_info['Index2'] = samples_info['Index2'].apply(reverse_complement_index) + if swap: + samples_info[['Index1', 'Index2']] = samples_info[['Index2', 'Index1']] + print(samples_info) + # Generate the updated samplesheet + +if __name__ == '__main__': + main() From 487cbec4709c6183ba75d9a750e4e306840dc379 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 14 Aug 2025 09:26:32 +0200 Subject: [PATCH 02/13] Filter projects and write output file --- aviti_index_fixer.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/aviti_index_fixer.py b/aviti_index_fixer.py index b30ecb9..b8ef14d 100644 --- a/aviti_index_fixer.py +++ b/aviti_index_fixer.py @@ -46,17 +46,29 @@ def main(manifest_path, project, swap, rc1, rc2): """Main function to fix the samplesheet indexes for AVITI runs.""" # Read the samplesheet manifest_header, manifest_data = load_manifest(manifest_path) + # Read the sample data into a data frame (look at Element_runs.py for an example) samples_info = load_sample_dataframe(manifest_data) + # Process the indexes based on the options provided + if project: + mask = samples_info['SampleName'].apply(lambda x: x.split("_")[0] == project) + else: + mask = pd.Series([True] * len(samples_info)) + if rc1: - samples_info['Index1'] = samples_info['Index1'].apply(reverse_complement_index) + samples_info.loc[mask, 'Index1'] = samples_info.loc[mask, 'Index1'].apply(reverse_complement_index) if rc2: - samples_info['Index2'] = samples_info['Index2'].apply(reverse_complement_index) + samples_info.loc[mask, 'Index2'] = samples_info.loc[mask, 'Index2'].apply(reverse_complement_index) if swap: - samples_info[['Index1', 'Index2']] = samples_info[['Index2', 'Index1']] - print(samples_info) + samples_info.loc[mask, ['Index1', 'Index2']] = samples_info.loc[mask, ['Index2', 'Index1']].values + # Generate the updated samplesheet + updated_samplesheet = manifest_header + "\n[SAMPLES]\n" + samples_info.to_csv(index=False, header=True).strip() + # Write the updated samplesheet to a new file + output_path = manifest_path.replace('.csv', '_updates.csv') + with open(output_path, 'w') as output_file: + output_file.write(updated_samplesheet) if __name__ == '__main__': main() From e5d6ea9cd9497e5cb679edbd4e724f9be39c32aa Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 14 Aug 2025 10:21:22 +0200 Subject: [PATCH 03/13] Sort output and add option to include additional samples --- aviti_index_fixer.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/aviti_index_fixer.py b/aviti_index_fixer.py index b8ef14d..eb72585 100644 --- a/aviti_index_fixer.py +++ b/aviti_index_fixer.py @@ -1,5 +1,6 @@ import click as cli import pandas as pd +import os NT_COMPLIMENT = { 'A': 'T', @@ -33,16 +34,15 @@ def reverse_complement_index(index): """Return the reverse complement of a given index.""" return ''.join(NT_COMPLIMENT[nuc] for nuc in reversed(index)) - @cli.command() @cli.option('--manifest_path', required=True, help='Path to the sample manifest. e.g. ~/fc/AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv') @cli.option('--project', required=False, help='Project ID, e.g. P10001. Only the indexes of samples with this specific project ID will be changed') @cli.option('--swap', is_flag=True, help='Swaps index 1 and 2.') @cli.option('--rc1', is_flag=True, help='Exchanges index 1 for its reverse compliment.') @cli.option('--rc2', is_flag=True, help='Exchanges index 2 for its reverse compliment.') -# TODO: Add option to include additional samples +@cli.option('--add_sample', multiple=True, help='Include additional sample(s). Use multiple times for multiple samples, or provide a file. Each new sample should have the same format as in the existing manifest. Example: --add_sample P12345,ATCG,CGTA,1,A__Project_25_16,301-10-10-301,ATCG-CGTA,') -def main(manifest_path, project, swap, rc1, rc2): +def main(manifest_path, project, swap, rc1, rc2, add_sample): """Main function to fix the samplesheet indexes for AVITI runs.""" # Read the samplesheet manifest_header, manifest_data = load_manifest(manifest_path) @@ -62,11 +62,25 @@ def main(manifest_path, project, swap, rc1, rc2): samples_info.loc[mask, 'Index2'] = samples_info.loc[mask, 'Index2'].apply(reverse_complement_index) if swap: samples_info.loc[mask, ['Index1', 'Index2']] = samples_info.loc[mask, ['Index2', 'Index1']].values - + for additional_sample in add_sample: + if os.path.isfile(additional_sample): + # If a file is provided, read it and append to the samples_info DataFrame + additional_samples = pd.read_csv(additional_sample, header=None) + additional_samples.columns = samples_info.columns + else: + # If a sample is provided directly, create a DataFrame from it + additional_samples = pd.DataFrame([additional_sample.split(',')], columns=samples_info.columns) + samples_info = pd.concat([samples_info, additional_samples], ignore_index=True) + + # Sort the samples by lane and SampleName + samples_info['Lane'] = samples_info['Lane'].astype(int) + samples_info.sort_values(by=['Lane', 'SampleName'], inplace=True) + # Generate the updated samplesheet - updated_samplesheet = manifest_header + "\n[SAMPLES]\n" + samples_info.to_csv(index=False, header=True).strip() + updated_samplesheet = manifest_header + "\n[SAMPLES]\n" + samples_info.to_csv(index=False, header=True) + # Write the updated samplesheet to a new file - output_path = manifest_path.replace('.csv', '_updates.csv') + output_path = manifest_path.replace('.csv', '_updated.csv') with open(output_path, 'w') as output_file: output_file.write(updated_samplesheet) From 9c05440a52ec857d3bc1682970c742d0ce04fe15 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 14 Aug 2025 10:40:15 +0200 Subject: [PATCH 04/13] Also update the lims_label --- aviti_index_fixer.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/aviti_index_fixer.py b/aviti_index_fixer.py index eb72585..f45a752 100644 --- a/aviti_index_fixer.py +++ b/aviti_index_fixer.py @@ -13,9 +13,12 @@ def load_manifest(path): """Load the manifest from the given path.""" with open(path, 'r') as file: manifest_content = file.read() - header = manifest_content.split("[SAMPLES]")[0] - samples = manifest_content.split("[SAMPLES]")[1].strip().split("\n") - return header, samples + header_section = manifest_content.split("[SAMPLES]")[0] + samples_section = manifest_content.split("[SAMPLES]")[1].strip().split("\n") + all_samples = load_sample_dataframe(samples_section) + samples_info = all_samples[all_samples["Project"] != "Control"].copy() + controls_info = all_samples[all_samples["Project"] == "Control"].copy() + return header_section, samples_info, controls_info def load_sample_dataframe(manifest_data): """Extract the sample data from the manifest and convert it to a DataFrame.""" @@ -45,11 +48,8 @@ def reverse_complement_index(index): def main(manifest_path, project, swap, rc1, rc2, add_sample): """Main function to fix the samplesheet indexes for AVITI runs.""" # Read the samplesheet - manifest_header, manifest_data = load_manifest(manifest_path) - - # Read the sample data into a data frame (look at Element_runs.py for an example) - samples_info = load_sample_dataframe(manifest_data) - + manifest_header, samples_info, controls_info = load_manifest(manifest_path) + # Process the indexes based on the options provided if project: mask = samples_info['SampleName'].apply(lambda x: x.split("_")[0] == project) @@ -62,6 +62,9 @@ def main(manifest_path, project, swap, rc1, rc2, add_sample): samples_info.loc[mask, 'Index2'] = samples_info.loc[mask, 'Index2'].apply(reverse_complement_index) if swap: samples_info.loc[mask, ['Index1', 'Index2']] = samples_info.loc[mask, ['Index2', 'Index1']].values + if rc1 or rc2 or swap: + # Update lims_label if any changes were made unless the "Project" column is not "Control" + samples_info.loc[mask, 'lims_label'] = samples_info.loc[mask, 'Index1'] + '-' + samples_info.loc[mask, 'Index2'] for additional_sample in add_sample: if os.path.isfile(additional_sample): # If a file is provided, read it and append to the samples_info DataFrame @@ -77,7 +80,7 @@ def main(manifest_path, project, swap, rc1, rc2, add_sample): samples_info.sort_values(by=['Lane', 'SampleName'], inplace=True) # Generate the updated samplesheet - updated_samplesheet = manifest_header + "\n[SAMPLES]\n" + samples_info.to_csv(index=False, header=True) + updated_samplesheet = manifest_header + "\n[SAMPLES]\n" + samples_info.to_csv(index=False, header=True) + controls_info.to_csv(index=False, header=False) # Write the updated samplesheet to a new file output_path = manifest_path.replace('.csv', '_updated.csv') From cf6c88470091c96d1ed92ca6dd1d9af8b3d97949 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 14 Aug 2025 10:53:24 +0200 Subject: [PATCH 05/13] print progress --- aviti_index_fixer.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/aviti_index_fixer.py b/aviti_index_fixer.py index f45a752..e665502 100644 --- a/aviti_index_fixer.py +++ b/aviti_index_fixer.py @@ -58,10 +58,13 @@ def main(manifest_path, project, swap, rc1, rc2, add_sample): if rc1: samples_info.loc[mask, 'Index1'] = samples_info.loc[mask, 'Index1'].apply(reverse_complement_index) + print("Reverse complementing Index1") if rc2: samples_info.loc[mask, 'Index2'] = samples_info.loc[mask, 'Index2'].apply(reverse_complement_index) + print("Reverse complementing Index2") if swap: samples_info.loc[mask, ['Index1', 'Index2']] = samples_info.loc[mask, ['Index2', 'Index1']].values + print("Swapping Index1 and Index2") if rc1 or rc2 or swap: # Update lims_label if any changes were made unless the "Project" column is not "Control" samples_info.loc[mask, 'lims_label'] = samples_info.loc[mask, 'Index1'] + '-' + samples_info.loc[mask, 'Index2'] @@ -74,6 +77,10 @@ def main(manifest_path, project, swap, rc1, rc2, add_sample): # If a sample is provided directly, create a DataFrame from it additional_samples = pd.DataFrame([additional_sample.split(',')], columns=samples_info.columns) samples_info = pd.concat([samples_info, additional_samples], ignore_index=True) + if len(additional_samples) == 1: + print("Adding additional sample:", additional_samples['SampleName'].tolist()[0]) + else: + print("Adding additional samples:", (", ").join(additional_samples['SampleName'].tolist())) # Sort the samples by lane and SampleName samples_info['Lane'] = samples_info['Lane'].astype(int) From 5f6592d151ba6edbea32758056f9bcf1bbdde8e7 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 14 Aug 2025 11:01:32 +0200 Subject: [PATCH 06/13] Allow for multiple projects to be specified --- aviti_index_fixer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/aviti_index_fixer.py b/aviti_index_fixer.py index e665502..57f52e9 100644 --- a/aviti_index_fixer.py +++ b/aviti_index_fixer.py @@ -17,7 +17,7 @@ def load_manifest(path): samples_section = manifest_content.split("[SAMPLES]")[1].strip().split("\n") all_samples = load_sample_dataframe(samples_section) samples_info = all_samples[all_samples["Project"] != "Control"].copy() - controls_info = all_samples[all_samples["Project"] == "Control"].copy() + controls_info = all_samples[all_samples["Project"] == "Control"].copy() # So that we don't apply any changes to control samples return header_section, samples_info, controls_info def load_sample_dataframe(manifest_data): @@ -39,7 +39,7 @@ def reverse_complement_index(index): @cli.command() @cli.option('--manifest_path', required=True, help='Path to the sample manifest. e.g. ~/fc/AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv') -@cli.option('--project', required=False, help='Project ID, e.g. P10001. Only the indexes of samples with this specific project ID will be changed') +@cli.option('--project', multiple=True, required=False, help='Project ID, e.g. P10001. Only the indexes of samples with this specific project ID will be changed. Use multiple times for multiple projects.') @cli.option('--swap', is_flag=True, help='Swaps index 1 and 2.') @cli.option('--rc1', is_flag=True, help='Exchanges index 1 for its reverse compliment.') @cli.option('--rc2', is_flag=True, help='Exchanges index 2 for its reverse compliment.') @@ -52,7 +52,7 @@ def main(manifest_path, project, swap, rc1, rc2, add_sample): # Process the indexes based on the options provided if project: - mask = samples_info['SampleName'].apply(lambda x: x.split("_")[0] == project) + mask = samples_info['SampleName'].apply(lambda x: x.split("_")[0] in project) else: mask = pd.Series([True] * len(samples_info)) @@ -66,7 +66,7 @@ def main(manifest_path, project, swap, rc1, rc2, add_sample): samples_info.loc[mask, ['Index1', 'Index2']] = samples_info.loc[mask, ['Index2', 'Index1']].values print("Swapping Index1 and Index2") if rc1 or rc2 or swap: - # Update lims_label if any changes were made unless the "Project" column is not "Control" + # Update lims_label if any changes were made samples_info.loc[mask, 'lims_label'] = samples_info.loc[mask, 'Index1'] + '-' + samples_info.loc[mask, 'Index2'] for additional_sample in add_sample: if os.path.isfile(additional_sample): From cc8bdd57b5b05e70ef57a0146a3b5fb154581bd3 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 14 Aug 2025 11:25:01 +0200 Subject: [PATCH 07/13] Cleanup --- aviti_index_fixer.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/aviti_index_fixer.py b/aviti_index_fixer.py index 57f52e9..2ee4c60 100644 --- a/aviti_index_fixer.py +++ b/aviti_index_fixer.py @@ -1,27 +1,30 @@ +import os import click as cli import pandas as pd -import os NT_COMPLIMENT = { 'A': 'T', 'T': 'A', 'C': 'G', - 'G': 'C' + 'G': 'C', } def load_manifest(path): """Load the manifest from the given path.""" with open(path, 'r') as file: manifest_content = file.read() + header_section = manifest_content.split("[SAMPLES]")[0] samples_section = manifest_content.split("[SAMPLES]")[1].strip().split("\n") + all_samples = load_sample_dataframe(samples_section) samples_info = all_samples[all_samples["Project"] != "Control"].copy() controls_info = all_samples[all_samples["Project"] == "Control"].copy() # So that we don't apply any changes to control samples + return header_section, samples_info, controls_info def load_sample_dataframe(manifest_data): - """Extract the sample data from the manifest and convert it to a DataFrame.""" + """Load the sample data into a DataFrame.""" header = manifest_data[0] sample_rows = manifest_data[1:] @@ -47,10 +50,8 @@ def reverse_complement_index(index): def main(manifest_path, project, swap, rc1, rc2, add_sample): """Main function to fix the samplesheet indexes for AVITI runs.""" - # Read the samplesheet manifest_header, samples_info, controls_info = load_manifest(manifest_path) - # Process the indexes based on the options provided if project: mask = samples_info['SampleName'].apply(lambda x: x.split("_")[0] in project) else: @@ -70,11 +71,9 @@ def main(manifest_path, project, swap, rc1, rc2, add_sample): samples_info.loc[mask, 'lims_label'] = samples_info.loc[mask, 'Index1'] + '-' + samples_info.loc[mask, 'Index2'] for additional_sample in add_sample: if os.path.isfile(additional_sample): - # If a file is provided, read it and append to the samples_info DataFrame additional_samples = pd.read_csv(additional_sample, header=None) additional_samples.columns = samples_info.columns else: - # If a sample is provided directly, create a DataFrame from it additional_samples = pd.DataFrame([additional_sample.split(',')], columns=samples_info.columns) samples_info = pd.concat([samples_info, additional_samples], ignore_index=True) if len(additional_samples) == 1: @@ -82,14 +81,10 @@ def main(manifest_path, project, swap, rc1, rc2, add_sample): else: print("Adding additional samples:", (", ").join(additional_samples['SampleName'].tolist())) - # Sort the samples by lane and SampleName samples_info['Lane'] = samples_info['Lane'].astype(int) samples_info.sort_values(by=['Lane', 'SampleName'], inplace=True) - # Generate the updated samplesheet updated_samplesheet = manifest_header + "\n[SAMPLES]\n" + samples_info.to_csv(index=False, header=True) + controls_info.to_csv(index=False, header=False) - - # Write the updated samplesheet to a new file output_path = manifest_path.replace('.csv', '_updated.csv') with open(output_path, 'w') as output_file: output_file.write(updated_samplesheet) From b3078b8a9450a8cb8057920ebac79fa8ba36f87d Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 14 Aug 2025 11:38:50 +0200 Subject: [PATCH 08/13] Add aviti_index_fixer to README --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index ae52508..0dbe420 100755 --- a/README.md +++ b/README.md @@ -2,6 +2,24 @@ Repository to store standalone scripts that do not belong to any bigger package or repository. +### aviti_index_fixer.py +Given a run manifest for Aviti, it generates a new one with the provided command line options. The new manifest will be output in the same place as the old, with the extension '_updated.csv'. + +#### Usage +Examples: +- Reverse complement index 1 and 2 for project P12345 and P67890: +`python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --rc1 --rc2 --project P12345 --project P67890` +- Swap indexes for all samples in the manifest: +`python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --swap` +- Include additional samples to the manifest, specifying the samples on the commandline: +`python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --add_sample "P12345_1097,ATAC,AGGC,1,A__Project_25_16,301-10-10-301,ATAC-AGGC," --add_sample "P12345_1098,GTAC,GGGC,1,A__Project_25_16,301-10-10-301,GTAC-GGGC,"` +- Include additional samples to the manifest, specifying the samples in a csv file (each line in the file needs to have the same format as in the manifest): +`python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --add_sample samples.csv` + +#### Dependencies + +* click +* pandas ### compute_undet_index_stats.py used to fetch stats about undermined indexes. From 598299bbd995833a7a2b7236c9e8d033350f8d39 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 14 Aug 2025 11:40:11 +0200 Subject: [PATCH 09/13] improve code blocks --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0dbe420..44975c4 100755 --- a/README.md +++ b/README.md @@ -8,13 +8,13 @@ Given a run manifest for Aviti, it generates a new one with the provided command #### Usage Examples: - Reverse complement index 1 and 2 for project P12345 and P67890: -`python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --rc1 --rc2 --project P12345 --project P67890` +```python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --rc1 --rc2 --project P12345 --project P67890``` - Swap indexes for all samples in the manifest: -`python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --swap` +```python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --swap``` - Include additional samples to the manifest, specifying the samples on the commandline: -`python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --add_sample "P12345_1097,ATAC,AGGC,1,A__Project_25_16,301-10-10-301,ATAC-AGGC," --add_sample "P12345_1098,GTAC,GGGC,1,A__Project_25_16,301-10-10-301,GTAC-GGGC,"` +```python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --add_sample "P12345_1097,ATAC,AGGC,1,A__Project_25_16,301-10-10-301,ATAC-AGGC," --add_sample "P12345_1098,GTAC,GGGC,1,A__Project_25_16,301-10-10-301,GTAC-GGGC,"``` - Include additional samples to the manifest, specifying the samples in a csv file (each line in the file needs to have the same format as in the manifest): -`python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --add_sample samples.csv` +```python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --add_sample samples.csv``` #### Dependencies From aa480203963b52b332c41a062ea8ec2b90d7d736 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 14 Aug 2025 11:41:07 +0200 Subject: [PATCH 10/13] Improve improved code blocks --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 44975c4..b01204b 100755 --- a/README.md +++ b/README.md @@ -8,12 +8,19 @@ Given a run manifest for Aviti, it generates a new one with the provided command #### Usage Examples: - Reverse complement index 1 and 2 for project P12345 and P67890: + ```python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --rc1 --rc2 --project P12345 --project P67890``` + - Swap indexes for all samples in the manifest: + ```python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --swap``` + - Include additional samples to the manifest, specifying the samples on the commandline: + ```python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --add_sample "P12345_1097,ATAC,AGGC,1,A__Project_25_16,301-10-10-301,ATAC-AGGC," --add_sample "P12345_1098,GTAC,GGGC,1,A__Project_25_16,301-10-10-301,GTAC-GGGC,"``` + - Include additional samples to the manifest, specifying the samples in a csv file (each line in the file needs to have the same format as in the manifest): + ```python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --add_sample samples.csv``` #### Dependencies From 91bc9c03ac42ab4c59f0a66dbfd85fef19dbbf38 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 14 Aug 2025 11:42:18 +0200 Subject: [PATCH 11/13] Code blocks, again... --- README.md | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index b01204b..6fceea3 100755 --- a/README.md +++ b/README.md @@ -8,20 +8,21 @@ Given a run manifest for Aviti, it generates a new one with the provided command #### Usage Examples: - Reverse complement index 1 and 2 for project P12345 and P67890: - -```python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --rc1 --rc2 --project P12345 --project P67890``` - +``` +python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --rc1 --rc2 --project P12345 --project P67890 +``` - Swap indexes for all samples in the manifest: - -```python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --swap``` - +``` +python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --swap +``` - Include additional samples to the manifest, specifying the samples on the commandline: - -```python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --add_sample "P12345_1097,ATAC,AGGC,1,A__Project_25_16,301-10-10-301,ATAC-AGGC," --add_sample "P12345_1098,GTAC,GGGC,1,A__Project_25_16,301-10-10-301,GTAC-GGGC,"``` - +``` +python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --add_sample "P12345_1097,ATAC,AGGC,1,A__Project_25_16,301-10-10-301,ATAC-AGGC," --add_sample "P12345_1098,GTAC,GGGC,1,A__Project_25_16,301-10-10-301,GTAC-GGGC," +``` - Include additional samples to the manifest, specifying the samples in a csv file (each line in the file needs to have the same format as in the manifest): - -```python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --add_sample samples.csv``` +``` +python aviti_index_fixer.py --manifest_path AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv --add_sample samples.csv +``` #### Dependencies From d2e786b648fcea9f2b4c4a204c449a931ecc718e Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 14 Aug 2025 13:20:37 +0200 Subject: [PATCH 12/13] ruff formatting --- aviti_index_fixer.py | 108 +++++++++++++++++++++++++++++-------------- 1 file changed, 74 insertions(+), 34 deletions(-) diff --git a/aviti_index_fixer.py b/aviti_index_fixer.py index 2ee4c60..1a2e085 100644 --- a/aviti_index_fixer.py +++ b/aviti_index_fixer.py @@ -3,26 +3,30 @@ import pandas as pd NT_COMPLIMENT = { - 'A': 'T', - 'T': 'A', - 'C': 'G', - 'G': 'C', + "A": "T", + "T": "A", + "C": "G", + "G": "C", } + def load_manifest(path): """Load the manifest from the given path.""" - with open(path, 'r') as file: + with open(path, "r") as file: manifest_content = file.read() - + header_section = manifest_content.split("[SAMPLES]")[0] samples_section = manifest_content.split("[SAMPLES]")[1].strip().split("\n") - + all_samples = load_sample_dataframe(samples_section) samples_info = all_samples[all_samples["Project"] != "Control"].copy() - controls_info = all_samples[all_samples["Project"] == "Control"].copy() # So that we don't apply any changes to control samples - + controls_info = all_samples[ + all_samples["Project"] == "Control" + ].copy() # So that we don't apply any changes to control samples + return header_section, samples_info, controls_info + def load_sample_dataframe(manifest_data): """Load the sample data into a DataFrame.""" header = manifest_data[0] @@ -36,58 +40,94 @@ def load_sample_dataframe(manifest_data): samples_info = pd.DataFrame.from_dict(sample_dicts) return samples_info + def reverse_complement_index(index): """Return the reverse complement of a given index.""" - return ''.join(NT_COMPLIMENT[nuc] for nuc in reversed(index)) + return "".join(NT_COMPLIMENT[nuc] for nuc in reversed(index)) -@cli.command() -@cli.option('--manifest_path', required=True, help='Path to the sample manifest. e.g. ~/fc/AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv') -@cli.option('--project', multiple=True, required=False, help='Project ID, e.g. P10001. Only the indexes of samples with this specific project ID will be changed. Use multiple times for multiple projects.') -@cli.option('--swap', is_flag=True, help='Swaps index 1 and 2.') -@cli.option('--rc1', is_flag=True, help='Exchanges index 1 for its reverse compliment.') -@cli.option('--rc2', is_flag=True, help='Exchanges index 2 for its reverse compliment.') -@cli.option('--add_sample', multiple=True, help='Include additional sample(s). Use multiple times for multiple samples, or provide a file. Each new sample should have the same format as in the existing manifest. Example: --add_sample P12345,ATCG,CGTA,1,A__Project_25_16,301-10-10-301,ATCG-CGTA,') +@cli.command() +@cli.option( + "--manifest_path", + required=True, + help="Path to the sample manifest. e.g. ~/fc/AVITI_run_manifest_2450545934_24-1214961_250722_154957_EunkyoungChoi_untrimmed.csv", +) +@cli.option( + "--project", + multiple=True, + required=False, + help="Project ID, e.g. P10001. Only the indexes of samples with this specific project ID will be changed. Use multiple times for multiple projects.", +) +@cli.option("--swap", is_flag=True, help="Swaps index 1 and 2.") +@cli.option("--rc1", is_flag=True, help="Exchanges index 1 for its reverse compliment.") +@cli.option("--rc2", is_flag=True, help="Exchanges index 2 for its reverse compliment.") +@cli.option( + "--add_sample", + multiple=True, + help="Include additional sample(s). Use multiple times for multiple samples, or provide a file. Each new sample should have the same format as in the existing manifest. Example: --add_sample P12345,ATCG,CGTA,1,A__Project_25_16,301-10-10-301,ATCG-CGTA,", +) def main(manifest_path, project, swap, rc1, rc2, add_sample): """Main function to fix the samplesheet indexes for AVITI runs.""" - manifest_header, samples_info, controls_info = load_manifest(manifest_path) + manifest_header, samples_info, controls_info = load_manifest(manifest_path) if project: - mask = samples_info['SampleName'].apply(lambda x: x.split("_")[0] in project) + mask = samples_info["SampleName"].apply(lambda x: x.split("_")[0] in project) else: mask = pd.Series([True] * len(samples_info)) if rc1: - samples_info.loc[mask, 'Index1'] = samples_info.loc[mask, 'Index1'].apply(reverse_complement_index) + samples_info.loc[mask, "Index1"] = samples_info.loc[mask, "Index1"].apply( + reverse_complement_index + ) print("Reverse complementing Index1") if rc2: - samples_info.loc[mask, 'Index2'] = samples_info.loc[mask, 'Index2'].apply(reverse_complement_index) + samples_info.loc[mask, "Index2"] = samples_info.loc[mask, "Index2"].apply( + reverse_complement_index + ) print("Reverse complementing Index2") if swap: - samples_info.loc[mask, ['Index1', 'Index2']] = samples_info.loc[mask, ['Index2', 'Index1']].values + samples_info.loc[mask, ["Index1", "Index2"]] = samples_info.loc[ + mask, ["Index2", "Index1"] + ].values print("Swapping Index1 and Index2") if rc1 or rc2 or swap: # Update lims_label if any changes were made - samples_info.loc[mask, 'lims_label'] = samples_info.loc[mask, 'Index1'] + '-' + samples_info.loc[mask, 'Index2'] + samples_info.loc[mask, "lims_label"] = ( + samples_info.loc[mask, "Index1"] + "-" + samples_info.loc[mask, "Index2"] + ) for additional_sample in add_sample: if os.path.isfile(additional_sample): additional_samples = pd.read_csv(additional_sample, header=None) additional_samples.columns = samples_info.columns else: - additional_samples = pd.DataFrame([additional_sample.split(',')], columns=samples_info.columns) + additional_samples = pd.DataFrame( + [additional_sample.split(",")], columns=samples_info.columns + ) samples_info = pd.concat([samples_info, additional_samples], ignore_index=True) if len(additional_samples) == 1: - print("Adding additional sample:", additional_samples['SampleName'].tolist()[0]) + print( + "Adding additional sample:", + additional_samples["SampleName"].tolist()[0], + ) else: - print("Adding additional samples:", (", ").join(additional_samples['SampleName'].tolist())) - - samples_info['Lane'] = samples_info['Lane'].astype(int) - samples_info.sort_values(by=['Lane', 'SampleName'], inplace=True) - - updated_samplesheet = manifest_header + "\n[SAMPLES]\n" + samples_info.to_csv(index=False, header=True) + controls_info.to_csv(index=False, header=False) - output_path = manifest_path.replace('.csv', '_updated.csv') - with open(output_path, 'w') as output_file: + print( + "Adding additional samples:", + (", ").join(additional_samples["SampleName"].tolist()), + ) + + samples_info["Lane"] = samples_info["Lane"].astype(int) + samples_info.sort_values(by=["Lane", "SampleName"], inplace=True) + + updated_samplesheet = ( + manifest_header + + "\n[SAMPLES]\n" + + samples_info.to_csv(index=False, header=True) + + controls_info.to_csv(index=False, header=False) + ) + output_path = manifest_path.replace(".csv", "_updated.csv") + with open(output_path, "w") as output_file: output_file.write(updated_samplesheet) -if __name__ == '__main__': + +if __name__ == "__main__": main() From 9729e6f3ef05d64ce635b54f9ac2d17e2161c748 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 19 Aug 2025 11:17:30 +0200 Subject: [PATCH 13/13] Make printed output useable running notes --- aviti_index_fixer.py | 72 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 12 deletions(-) diff --git a/aviti_index_fixer.py b/aviti_index_fixer.py index 1a2e085..e2841c3 100644 --- a/aviti_index_fixer.py +++ b/aviti_index_fixer.py @@ -46,6 +46,33 @@ def reverse_complement_index(index): return "".join(NT_COMPLIMENT[nuc] for nuc in reversed(index)) +def print_running_note(project, change_type): + """Print a note about the changes being made.""" + if project: + project_text = "project " if len(project) == 1 else "projects " + project_text += ( + project[0] + if len(project) == 1 + else " and ".join(project) + if len(project) == 2 + else ", ".join(project) + ) + else: + project_text = "all samples" + if change_type == "swap": + print( + f"Index 1 and 2 in {project_text} were switched prior to re-demultiplexing." + ) + elif change_type == "rc1": + print( + f"Index 1 in {project_text} was converted to reverse complement prior to re-demultiplexing." + ) + elif change_type == "rc2": + print( + f"Index 2 in {project_text} was converted to reverse complement prior to re-demultiplexing", + ) + + @cli.command() @cli.option( "--manifest_path", @@ -79,41 +106,62 @@ def main(manifest_path, project, swap, rc1, rc2, add_sample): samples_info.loc[mask, "Index1"] = samples_info.loc[mask, "Index1"].apply( reverse_complement_index ) - print("Reverse complementing Index1") + print_running_note(project, "rc1") if rc2: samples_info.loc[mask, "Index2"] = samples_info.loc[mask, "Index2"].apply( reverse_complement_index ) - print("Reverse complementing Index2") + print_running_note(project, "rc2") if swap: samples_info.loc[mask, ["Index1", "Index2"]] = samples_info.loc[ mask, ["Index2", "Index1"] ].values - print("Swapping Index1 and Index2") + print_running_note(project, "swap") + if rc1 or rc2 or swap: # Update lims_label if any changes were made samples_info.loc[mask, "lims_label"] = ( samples_info.loc[mask, "Index1"] + "-" + samples_info.loc[mask, "Index2"] ) + + additional_samples_table = {} for additional_sample in add_sample: if os.path.isfile(additional_sample): additional_samples = pd.read_csv(additional_sample, header=None) additional_samples.columns = samples_info.columns + for _, row in additional_samples.iterrows(): + additional_samples_table[row["SampleName"]] = { + "index": f"{row['Index1']}-{row['Index2']}", + "lane": row["Lane"], + } else: additional_samples = pd.DataFrame( [additional_sample.split(",")], columns=samples_info.columns ) + additional_samples_table[additional_samples["SampleName"].tolist()[0]] = { + "index": f"{additional_samples['Index1'].tolist()[0]}-{additional_samples['Index2'].tolist()[0]}", + "lane": additional_samples["Lane"].tolist()[0], + } samples_info = pd.concat([samples_info, additional_samples], ignore_index=True) - if len(additional_samples) == 1: - print( - "Adding additional sample:", - additional_samples["SampleName"].tolist()[0], - ) - else: - print( - "Adding additional samples:", - (", ").join(additional_samples["SampleName"].tolist()), + if len(additional_samples_table) == 1: + sample_name = list(additional_samples_table.keys())[0] + print( + f"Sample {sample_name} with indexes", + f"{additional_samples_table[sample_name]['index']}", + f"was added to lane {additional_samples_table[sample_name]['lane']}", + "prior to re-demultiplexing.", + ) + elif len(additional_samples_table) > 1: + print( + "The following samples were added to the manifest prior to re-demultiplexing:\n" + "SampleName, Index1-Index2, Lane\n" + + "\n".join( + [ + f"{name}, {info['index']}, {info['lane']}" + for name, info in additional_samples_table.items() + ] ) + ) samples_info["Lane"] = samples_info["Lane"].astype(int) samples_info.sort_values(by=["Lane", "SampleName"], inplace=True)