Skip to content

Commit c706f19

Browse files
Create snvs_comp_distr.r
1 parent 118832c commit c706f19

1 file changed

Lines changed: 147 additions & 0 deletions

File tree

snvs_comp_distr.r

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
.libPaths(c('~/Rlibs',.libPaths()))
2+
suppressWarnings(library('optparse'))
3+
4+
script.desc <-
5+
'Generates statistics to determine if SNV distribution has role in clustering.
6+
User must supply the RDS of a Seurat object (-r) which should contain a single
7+
Seurat experiment. A file also must be submitted with (-t) which is a file that
8+
contains a path to all the scReadCounts files containing the SNV information.'
9+
10+
parser <- OptionParser(description=script.desc)
11+
parser <- add_option(parser, c('-r', '--rds-file'),
12+
type='character',
13+
help='RDS file containing Seurat object.')
14+
parser <- add_option(parser, c('-t', '--snv-file'),
15+
type='character',
16+
help='scReadCounts output file containing SNVs.')
17+
parser <- add_option(parser, c('-w', '--th-snv-cells'),
18+
type='numeric', default=10,
19+
help=paste0('Threshold for maximum percentage of cells ',
20+
' that contain an SNV for bad reads. ',
21+
'(Default: 10)'))
22+
parser <- add_option(parser, c('-z', '--th-snv-reads'),
23+
type='numeric', default=1,
24+
help=paste0('Threshold for number of minimum reads to ',
25+
'qualify as SNV. (Default: 1)'))
26+
args <- parse_args(parser)
27+
28+
error.msg <- NULL
29+
# Check if Seurat object is passed
30+
if (is.null(args$`rds-file`))
31+
error.msg <- paste(error.msg, '- Seurat RDS object (-r) is required.',
32+
sep='\n')
33+
34+
# Check if SNV argument is passed
35+
if (is.null(args$`snv-file`))
36+
error.msg <- paste(error.msg, '- SNV data (-t) is required.', sep='\n')
37+
38+
if (args$`th-snv-reads`<=0)
39+
error.msg <- paste(error.msg, '- th-snv-reads needs to be greater than 0.',
40+
sep='\n')
41+
42+
# Check if there are any errors before proceeding
43+
if (!is.null(error.msg)) {
44+
print_help(parser)
45+
stop(error.msg)
46+
}
47+
48+
library('Seurat')
49+
srat <- readRDS(args$`rds-file`)
50+
snv <- read.table(args$`snv-file`, sep= '\t', header=T)
51+
# convert '-' to NA in VAF column (here VAF = ∞ or not defined as var, ref = 0)
52+
snv$VAF[snv$VAF == '-'] <- NA
53+
# convert VAF to numeric
54+
snv$VAF <- as.numeric(snv$VAF)
55+
56+
sample.name <- gsub('(.*/)*([A-Za-z0-9]+)_.*.rds', '\\2', args$`rds-file`)
57+
58+
# Filtering based on arguments passed
59+
# First filter (-x)
60+
#snv <- snv[snv$X.BadRead<=args$`th-bad-reads`, ]
61+
62+
# First filter (-w)
63+
snv.temp <- snv
64+
snv.temp['BadReadFlag'] = 0
65+
snv.temp[snv.temp[['X.BadRead']]>0, 'BadReadFlag'] = 1
66+
snv.read.filt <- aggregate(BadReadFlag~CHROM+POS+REF+ALT, data=snv.temp,
67+
function (x) 100*sum(x)/length(x))
68+
snv.read.filt <- snv.read.filt[snv.read.filt$BadReadFlag<=args$`th-snv-cells`,
69+
c('CHROM', 'POS', 'REF', 'ALT')]
70+
snv <- merge(snv, snv.read.filt, by=c('CHROM', 'POS', 'REF', 'ALT'))
71+
72+
# Second filter (-z)
73+
snv$VAF[snv$SNVCount<args$`th-snv-reads`] <- 0
74+
75+
if (nrow(snv)==0)
76+
stop('There are no rows left after filtering.')
77+
78+
79+
# Cluster
80+
df.cid <- as.data.frame(srat[['seurat_clusters']])
81+
df.cid <- data.frame(ReadGroup=rownames(df.cid),
82+
ClusterID=df.cid[, 1], row.names=NULL)
83+
n.cluster <- length(unique(df.cid$ClusterID))
84+
df.snv <- merge(snv, df.cid, by='ReadGroup')
85+
86+
# Calculate variance of VAF across the sample
87+
df.snv$VAF[df.snv$VAF>0] <- 1 # Change non-zero VAF to 1
88+
df.samp.stats <- aggregate(VAF~CHROM+POS+REF+ALT, data=df.snv,
89+
function (x) sum((x-mean(x, na.rm=T))^2, na.rm=T))
90+
colnames(df.samp.stats)[5] <- 'TotalSS'
91+
92+
93+
# Calculate estimate VAF for each cluster based on mean in a cluster
94+
df.est <- aggregate(VAF~CHROM+POS+REF+ALT+ClusterID, data=df.snv,
95+
function (x) mean(x, na.rm=T))
96+
colnames(df.est)[6] <- 'ModelVAF'
97+
98+
99+
# Calculate number of cells for each SNV
100+
df.n.int <- aggregate(VAF~CHROM+POS+REF+ALT, data=df.snv,
101+
function (x) sum(!is.na(x)))
102+
colnames(df.n.int)[5] <- 'N'
103+
104+
# Calculate model prediction
105+
df.clust.int <- merge(df.snv, df.est, by=c('CHROM', 'POS', 'REF', 'ALT',
106+
'ClusterID'))
107+
df.clust.int <- merge(df.clust.int, df.n.int, by=c('CHROM', 'POS', 'REF',
108+
'ALT'))
109+
df.clust.int <- df.clust.int[, c(1, 2, 3, 4, 5, 6, 15, 16, 17)]
110+
df.clust.int['ModelSS'] <- (df.clust.int['VAF']-df.clust.int['ModelVAF'])^2
111+
112+
df.clust.int['ModelSS'] <- df.clust.int['ModelSS']
113+
df.clust.sum <- aggregate(ModelSS~CHROM+POS+REF+ALT, data=df.clust.int,
114+
function (x) sum(x, na.rm=T))
115+
116+
df.final <- merge(df.samp.stats, df.clust.sum,
117+
by=c('CHROM', 'POS', 'REF', 'ALT'))
118+
df.final <- merge(df.final, df.n.int,
119+
by=c('CHROM', 'POS', 'REF', 'ALT'))
120+
121+
# Do basic filtering
122+
df.final['R2'] <- 1-df.final['ModelSS']/df.final['TotalSS']
123+
df.final['F'] <- ((df.final['TotalSS']-df.final['ModelSS'])/(n.cluster-1))/
124+
(df.final['ModelSS']/(df.final['N']-n.cluster))
125+
126+
# Do basic filtering
127+
df.final <- df.final[df.final$TotalSS!=0, ]
128+
df.final <- df.final[is.finite(df.final$F), ]
129+
df.final <- df.final[df.final$F>0, ]
130+
131+
# Calculate p-value
132+
df.final['p'] <- 1-pf(df.final$F, n.cluster-1,df.final$N-n.cluster)
133+
df.final['padj'] <- p.adjust(df.final$p, method='bonferroni',
134+
length(df.final$p))
135+
df.final <- df.final[order(df.final$padj, -df.final$F), ]
136+
137+
write.table(df.final, paste0(sample.name, '_snv_candidate_',
138+
sprintf('%02d', args$`th-snv-cells`), 'n',
139+
sprintf('%02d', args$`th-snv-reads`),
140+
'n.txt'),
141+
sep='\t', row.names=F)
142+
df.final <- df.final[df.final$padj < 0.1, ]
143+
write.table(df.final, paste0(sample.name, '_snv_candidate_',
144+
sprintf('%02d', args$`th-snv-cells`), 'n',
145+
sprintf('%02d', args$`th-snv-reads`),
146+
'n_q01.txt'),
147+
sep='\t', row.names=F)

0 commit comments

Comments
 (0)