-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSnakefile
More file actions
128 lines (116 loc) · 5.3 KB
/
Snakefile
File metadata and controls
128 lines (116 loc) · 5.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
shell.prefix("source ~/.bash_profile; set -euo pipefail;")
#wildcard rule for samples directory
SAMPLES, = glob_wildcards(config['datadirs']['bam'] + "/" + "{sample}_Aligned.sortedByCoord.out.md.bam")
rule all:
input:
expand(config['datadirs']['bai'] + "/" + "{sample}_Aligned.sortedByCoord.out.md.bam.bai", sample=SAMPLES),
expand(config['datadirs']['verifybam'] + "/" + "{sample}.selfSM", sample=SAMPLES),
expand(config['datadirs']['parse'] + "/" + "{sample}-results.txt", sample=SAMPLES),
config['datadirs']['verifybamFinal'] + "/" + "verify.txt"
rule index:
input:
bam = config['datadirs']['bam'] + "/" + "{sample}_Aligned.sortedByCoord.out.md.bam"
output:
bai = config['datadirs']['bai'] + "/" + "{sample}_Aligned.sortedByCoord.out.md.bam.bai"
shell: """
samtools index {input.bam} {ouput.bai}
"""
rule verifybam_id:
input:
vcf = config['reference']['vcf'][['hg38'],
bam = config['datadirs']['bam'] + "/" + "{sample}_Aligned.sortedByCoord.out.md.bam".
index = config['datadirs']['bai'] + "/" + "{sample}_Aligned.sortedByCoord.out.md.bam.bai"
output:
selfSM = config['datadirs']['verifybam'] + "/" + "{sample}.selfSM",
bestSM = config['datadirs']['verifybam'] + "/" + "{sample}.bestSM",
depthSM = config['datadirs']['verifybam'] + "/" + "{sample}.depthSM"
params:
prefix = config['datadirs']['verifybam'] + "/" + "{sample}",
individual = "{sample}"
resources:
mem_mb = 30000
shell: """
verifyBamID --vcf {input.vcf} --bam {input.bam} --best --ignoreRG --smID {params.individual} --out {params.prefix} --verbose
"""
rule parse_verify:
input:
selfSM = config['datadirs']['verifybam'] + "/" + "{sample}.selfSM",
bestSM = config['datadirs']['verifybam'] + "/" + "{sample}.bestSM",
depthSM = config['datadirs']['verifybam'] + "/" + "{sample}.depthSM"
output: config['datadirs']['parse'] + "/" + "{sample}-results.txt"
run:
selfSM = open(input.selfSM, "rt")
bestSM = open(input.bestSM, "rt")
depthSM = open(input.depthSM, "rt")
results = open(output[0], "w")
for line in selfSM:
# Confirm the header columns
if line[0] == "#":
cols = line.strip().split("\t")
assert cols[0] == "#SEQ_ID" and cols[2] == "CHIP_ID" and \
cols[3] == "#SNPS" and cols[4] == "#READS" and \
cols[5] == "AVG_DP" and cols[6] == "FREEMIX" and \
cols[11] == "CHIPMIX", "selfSM columns are as expected"
else:
cols = line.strip().split("\t")
seq_id = cols[0]
chip_id = cols[2]
chip_id == seq_id
snps = cols[3]
reads = cols[4]
avg_dp = cols[5]
selfSM_freemix = cols[6]
selfSM_chipmix = cols[11]
for line in bestSM:
# Confirm the header columns
if line[0] == "#":
cols = line.strip().split("\t")
assert cols[0] == "#SEQ_ID" and cols[2] == "CHIP_ID" and \
cols[3] == "#SNPS" and cols[4] == "#READS" and \
cols[5] == "AVG_DP" and cols[6] == "FREEMIX" and \
cols[11] == "CHIPMIX", "bestSM columns are as expected"
else:
cols = line.strip().split("\t")
seq_id = cols[0]
bestSM_chip_id = cols[2]
match = str(bestSM_chip_id == seq_id).upper()
bestSM_freemix = cols[6]
bestSM_chipmix = cols[11]
# Report the number of SNPs that had more than a minimum read depth
depth_min = 10
snps_w_min = 0
for line in depthSM:
# Confirm the header columns
if line[0] == "#":
cols = line.strip().split("\t")
assert cols[1] == "DEPTH" and cols[2] == "#SNPs", \
"depthSM columns are as expected"
else:
cols = line.strip().split("\t")
depth = int(cols[1])
n_snps = int(cols[2])
if depth >= depth_min:
snps_w_min = snps_w_min + n_snps
out_header = ["id", "best", "match",
"self_freemix", "self_chipmix",
"best_freemix", "best_chipmix",
"snps", "reads", "avg_dp",
"min_dp", "snps_w_min"]
out_cols = [seq_id, bestSM_chip_id, match,
selfSM_freemix, selfSM_chipmix,
bestSM_freemix, bestSM_chipmix,
snps, reads, avg_dp,
str(depth_min), str(snps_w_min)]
results.write("\t".join(out_header) + "\n")
results.write("\t".join(out_cols) + "\n")
selfSM.close()
bestSM.close()
depthSM.close()
results.close()
rule combine_verify:
input:
files = expand(config['datadirs']['parse'] + "/" + "{sample}-results.txt", sample = SAMPLES)
output: config['datadirs']['verifybamFinal'] + "/" + "verify.txt"
shell:
"head -n 1 {input[0]} > {output};"
"cat {input.files} | grep -v \"id\" | sort -k1n >> {output}"