forked from jccastrog/pa_genomics
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_MInetwork.R
More file actions
executable file
·162 lines (157 loc) · 7.48 KB
/
run_MInetwork.R
File metadata and controls
executable file
·162 lines (157 loc) · 7.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/env Rscript
################################################################################
# Name: run_MInetwork.R
# Author: Juan C. Castro [jcastro37@gatech.edu]
# Georgia Institute of Technology
#
# Version: 1.0
# Date: 07-Aug-2019
# License: GNU General Public License v3.0.
# ==============================================================================
#
################################################################################
#======================= 0.0 Install required packages ========================#
personal.lib.path = Sys.getenv("R_LIBS_USER")
if(!file.exists(personal.lib.path))
dir.create(personal.lib.path)
packages <- c("entropy", "optparse")
if(any(!(packages %in% installed.packages()))){
cat("Please wait a moment! Installing required packages ...\n")
install.packages(packages[!(packages %in% installed.packages())],
quiet = T, repos="http://cran.rstudio.com/",
lib = personal.lib.path)
cat("Required packages installed!\n")
}
#======== 1.0 Load packages, define functions, and initialize variable========#
# 1.1 Load packages ==========================================================#
suppressPackageStartupMessages(library(entropy))
suppressPackageStartupMessages(library(optparse))
# 1.2 Define functions =======================================================#
#' Estimate mutual information (MI) for all pairs of variables in an expression
#' matrix
#'
#' @param matrix.data A matrix with expression data where columns are genes and
#' rows are time points
#' @param num.genes The number of genes in the matrix
#' @return mutual.mat A matrix with values of mutual information for all pair of
#' genes
#' @author Juan C. Castro \email{jcastro37@gatech.edu}
mutualInfoEst <- function(matrix.data,num.genes) {
mutual.mat <- matrix(ncol=num.genes, nrow=num.genes)
for (i in 1:num.genes) {
for (j in 1:num.genes) {
if (i>j) {
discret.vec <- cbind(matrix.data[,i],matrix.data[,j])
mutual.mat[i,j] <- suppressWarnings(mi.empirical(discret.vec,unit=c("log2")))
}
}
}
return(mutual.mat)
}
#' Estimate a null distribution of MI for variables in an expression matrix
#'
#' @param matrix.data A matrix with expression data where columns are genes and
#' rows are time points
#' @param num.genes The number of genes in the matrix
#' @param randomizations Number of times to randomize expression values
#' @return dist.mat A matrix with values of mutual information for a randomized
#' expression matrix where each column is a linearized version of a random
#' expression matrix
#' @author Juan C. Castro \email{jcastro37@gatech.edu}
nullInfoDist <- function(matrix.data, num.genes, randomizations){
ncol.mat <- (num.genes*(num.genes-1))/2
dist.mat <- matrix(ncol = ncol.mat, nrow = randomizations)
for (counts in 1:randomizations) {
random.matrix.data <- matrix(ncol = ncol(matrix.data), nrow = nrow(matrix.data))
for (i in 1:ncol(matrix.data)){
random.matrix.data[,i] <- sample(matrix.data[,i])
}
random.mi.mat <- mutualInfoEst(random.matrix.data, num.genes)
random.mi.vec <- random.mi.mat[lower.tri(random.mi.mat)]
dist.mat[counts,] <- random.mi.vec
}
return(dist.mat)
}
#' Calculate a score for each value of mutual information in an MI matrix
#'
#' @param initial.matrix A matrix with MI values calculated form an expression
#' matrix
#' @param null.dist.maxtrix A matrix with null values of MI obatined with nullInfoDist
#' @param num.genes The number of genes in the matrix
#' @param randomizations Number of times to randomize expression values
#' @return p.mat A matrix with pValues of for the MI values in initialMatrix
#' @author Juan C. Castro \email{jcastro37@gatech.edu}
infoScore <- function(initial.matrix, null.dist.matrix, num.genes, randomizations){
mat.size <- (num.genes*(num.genes-1))/2
sum.dist <- randomizations
lineal.ini.mat <- initial.matrix[lower.tri(initial.matrix)]
p.vec <- c()
for (i in 1:mat.size){
mi.counts <- sum(null.dist.matrix[,i] >= lineal.ini.mat[i])
p.score <- mi.counts/sum.dist
p.vec[i] <- p.score
}
p.mat <- matrix(0, ncol = num.genes, nrow = num.genes)
p.mat[lower.tri(p.mat)] <- p.vec
return(p.mat)
}
#1.3 Initialize variables ===================================================#
# 1.3.1 Parser variables #
# Get script name
initial.options <- commandArgs(trailingOnly = FALSE)
script.name <- basename(sub("--file=", "", initial.options[grep("--file=", initial.options)]))
# Process command line arguments
# Create a parser
option_list = list(
make_option(c("-p", "--pangenome_matrix"), type = "character", default = NULL,
help ="The OGs matrix that contains the precense abcense values for each gene in each genome.", metavar = "character"),
make_option(c("-l", "--ogs_list"), type = "character", default = NULL,
help ="A file with a list with the OG name from which the network is to be built.", metavar = "character"),
make_option(c("-n", "--num_reps"), type = "numeric", default = 100000,
help ="Number of repetitions to calculate the null MI distribution.", metavar = "numeric"),
make_option(c("-o", "--output"), type = "character", default = "network.tsv",
help ="Name of the output file, the output is stored as a tab separated table of either the OGs matrix (binary or numerical) or the rarefraction curve.", metavar = "character")
)
# Add command line arguments
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
opt_names <- names(opt)
if (is.null(opt$pangenome_matrix)){
print_help(opt_parser)
err_str <- 'Argument missing "-p" --pangenome_matrix" must be provided.\n'
stop(err_str, call.=FALSE)
} else if (is.null(opt$ogs_list)){
print_help(opt_parser)
err_str <- 'Argument missing "-l" --ogs_list" must be provided.\n'
stop(err_str, call.=FALSE)
}
# Parse the command line arguments
pangenome.matrix <- opt$pangenome_matrix
ogs.list <- opt$ogs_list
num.reps <- opt$num_reps
output <- opt$output
#===================== 2.0 Format original data as matrix =====================#
# 2.1 Load data ===============================================================#
ogs.df <- read.table(file = pangenome.matrix, header = T, row.names = 1, stringsAsFactors = F)
ogs.list <- read.table(file = ogs.list, header = F, stringsAsFactors = F)
# 2.2 Subset the data to the OGs of the list
sub.df <- ogs.df[ogs.list$V1,]
matrix.data <- t(sub.df)
num.genes <- ncol(matrix.data)
#====================== 3.0 Create the graph based on MI ======================#
# 3.1 Estimate MI and pvalues
initial.mi <- mutualInfoEst(matrix.data = matrix.data, num.genes = num.genes)
null.mi <- nullInfoDist(matrix.data = matrix.data, num.genes = num.genes, randomizations = num.reps)
pvals.mi <- infoScore(initial.matrix = initial.mi, null.dist.matrix = null.mi, num.genes = num.genes, randomizations = num.reps)
# 3.2 Store edges as a data.frame object
edge.list <- data.frame(OG1 = c(), OG2 = c(), MI = c(), pVal = c())
for (i in 1:num.genes){
for (j in 1:num.genes)
if(i>j){
loc.df <- data.frame (OG1 = ogs.list[i,1], OG2 = ogs.list[j,1], MI = initial.mi[i,j], pVal = pvals.mi[i,j])
edge.list <- rbind(edge.list, loc.df)
}
}
#===================== 4.0 Write the edge list of a file ======================#
write.table(x = edge.list, file = output, quote = F, sep = '\t', row.names = F)
#==============================================================================#