This repository is dedicated to the reproduction of the manuscript titled "Paean: A unified and efficient transcriptome quantification system using heterogeneous computing."
The official source code is at the following link
The current reproduced environment and the minimal reproduced requirements are
| Resource | Current | Minimal |
|---|---|---|
| System | Ubuntu 20.04 | Ubuntu 20.04 |
| CPU | AMD Ryzen 9 5900X 12-Core Processor(24 cores) | A CPU with 24 cores |
| Memory | 125GB | 64GB |
| GPU | NVIDIA RTX 3090 | NVIDIA GTX 1080ti |
| CUDA | 12.4 | 9.2 |
We use Sequencing Quality Control (SEQC) dataset for gene expression quantification and Human Verified Sample (HVS) dataset for alternative splicing. All files of each dataset (12 files for SEQC and 6 files for HVS) can be downloaded from NCBI by the following Run IDs.
| Dataset | Run ID |
|---|---|
| SEQC | SRR896663 SRR896679 SRR896695 SRR896743 SRR896759 SRR896775 SRR896823 SRR896839 SRR896855 SRR896903 SRR896919 SRR896935 |
| HVS | SRR536342 SRR536344 SRR536346 SRR536348 SRR536350 SRR536352 |
To use Paean, you should build it from source. For complete details please follows Paean'repository.
First we should install some dependent packages. You should prepare an environment in which Ubuntu >= 20.04 and CUDA >= 9.2.
apt update
# zlib
apt install -y zlib1g-dev autoconf libcurl4-openssl-dev
# libdeflate
git clone --branch v1.21 https://github.com/ebiggers/libdeflate.git
cd libdeflate
cmake -B build && cmake --build build
cd build && make install
# htslib, download htslib-1.21.tar.bz2 from https://sourceforge.net/projects/samtools/files/samtools/1.21/
tar -xvf htslib-1.21.tar.bz2
cd htslib-1.21/
autoreconf -i
./configure --with-libdeflate
make -j && make installThen build Paean.
export PATH=/usr/local/cuda/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
git clone https://github.com/Bio-Acc/Paean.git
cd Paean/Paean-GPU
mkdir build && cd build
cmake .. -DCMAKE_BUILD_TYPE=Release
make -j && make installFor other compared software, install them via conda.
mkdir -p ~/miniconda3
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh
bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
rm ~/miniconda3/miniconda.shSince these softwares have incompatible dependencies, we need to install them using multiple conda envs.
# set conda env
source ~/miniconda3/bin/activate
conda create -n test -y python=3.8
conda activate test
conda install -y -c conda-forge -c bioconda samtools=1.21 \
hisat2=2.2.1 \
stringtie=2.2.3 \
salmon=1.9.0
# the following softwares are incompatible with other softwares
# rsem
conda create -n rsem -y python=3.7
conda activate rsem
conda install -y -c conda-forge -c bioconda rsem=1.3.3
# rmats
conda create -n rmats -y python=3.6
conda activate rmats
conda install -y -c conda-forge -c bioconda rmats=4.1.0
# misopy
conda create -n misopy -y python=2.7
conda activate misopy
conda install -y -c conda-forge -c bioconda misopy=0.5.4 pybedtools gffutils=0.8.7.1For kallisto because there exists a bug, we build kallisto from source.
git clone --branch v0.51.0 https://github.com/pachterlab/kallisto.git
cd kallisto
mkdir build && cd build
cmake .. -DENABLE_AVX2=OFF -DCOMPILATION_ARCH=OFF
# cannot parallelly compile
make && make installYou can download hg38 reference fasta and gff3/gtf files from gencode website
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_36/GRCh38.primary_assembly.genome.fa.gz
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_36/gencode.v36.annotation.gff3.gz
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_36/gencode.v36.transcripts.fa.gz
gunzip GRCh38.primary_assembly.genome.fa.gz
gunzip gencode.v36.annotation.gff3.gz
gunzip gencode.v36.transcripts.fa.gzFor other files, you can find in Paean'repository.
# hisat2 index
conda activate test
mkdir hisat2_index
hisat2-build -p 20 GRCh38.primary_assembly.genome.fa hisat2_index/hg38
# star index
conda activate rmats
mkdir star_index
STAR --runThreadN 20 --runMode genomeGenerate --genomeDir star_index/hg38 --genomeFastaFiles GRCh38.primary_assembly.genome.fa --sjdbGTFfile gencode.v36.annotation.gtf --sjdbOverhang 125For RNA-seq softwares which require bam data, we use hisat2 to do mapping.
hisat2 -x hisat2_index/hg38 -t -p 20 -dta -1 /path/to/SEQC_ILM_BGI_A_1_L01_ATCACG_AC0AYTACXX_1.fastq -2 /path/to/SEQC_ILM_BGI_A_1_L01_ATCACG_AC0AYTACXX_2.fastq -S /path/to/SEQC_ILM_BGI_A_1_L01_ATCACG_AC0AYTACXX.sam
samtools view -h /path/to/SEQC_ILM_BGI_A_1_L01_ATCACG_AC0AYTACXX.sam -o /path/to/SEQC_ILM_BGI_A_1_L01_ATCACG_AC0AYTACXX.bamcd Paean/Paean-GPU
time paean -g input/gene.annotation.gff3 -l input/length_table.csv -r /path/to/SEQC_ILM_BGI_A_1_L01_ATCACG_AC0AYTACXX.bam -x SE,A3SS -y input/csv/SE.annotation.csv,input/csv/A3SS.annotation.csv -o paean_result -t 20 -m 2# build index
conda activate rsem
mkdir resem_index
rsem-prepare-reference --gtf gencode.v36.annotation.gtf --star --star-path /home/lguanjiawen/miniconda3/envs/rmats/bin -p 20 --star-sjdboverhang 125 GRCh38.primary_assembly.genome.fa rsem_index/hg38
# run
time rsem-calculate-expression --paired-end --star --star-path /home/lguanjiawen/miniconda3/envs/rmats/bin -p 20 /path/to/SEQC_ILM_BGI_A_1_L01_ATCACG_AC0AYTACXX_1.fastq /path/to/SEQC_ILM_BGI_A_1_L01_ATCACG_AC0AYTACXX_2.fastq rsem_index/hg38 rsem_resultconda activate test
time samtools sort -@20 /path/to/SEQC_ILM_BGI_A_1_L01_ATCACG_AC0AYTACXX.bam -o /path/to/SEQC_ILM_BGI_A_1_L01_ATCACG_AC0AYTACXX.sort.bam
time stringtie -p 20 -o stringtie.out.gtf -G gencode.v36.annotation.gff3 /path/to/SEQC_ILM_BGI_A_1_L01_ATCACG_AC0AYTACXX.sort.bam# build index
kallisto index -i kallisto_index/hg38 GRCh38.primary_assembly.genome.fa
# run
time kallisto quant -i kallisto_index/hg38 -t 20 -o kallisto_result /path/to/SEQC_ILM_BGI_A_1_L01_ATCACG_AC0AYTACXX_1.fastq /path/to/SEQC_ILM_BGI_A_1_L01_ATCACG_AC0AYTACXX_2.fastqconda activate test
time salmon quant -t gencode.v36.transcripts.fa -l IU -p 20 -a /path/to/SEQC_ILM_BGI_A_1_L01_ATCACG_AC0AYTACXX.bam -o salmon_result/SEQC_ILM_BGI_A_1_L01_ATCACG_AC0AYTACXX# make two group file
echo "/path/to/SRR536342.bam,/path/to/SRR536344.bam,/path/to/SRR536346.bam" > b1.txt
echo "/path/to/SRR536348.bam,/path/to/SRR536350.bam,/path/to/SRR536352.bam" > b2.txt
# run
conda activate rmats
time rmats.py --b1 b1.txt --b2 b2.txt --gtf gencode.v36.annotation.gtf --bi start_index/hg38 -t paired --readLength 101 --nthread 20 --od rmats_result --tmp /tmp/tmp_outputPrepare
# separete alternative splicing event from annotation
# download UCSC tables
wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/hg38.ensGene.gtf.gz
gunzip hg38.ensGene.gtf.gz
# install converter tool
conda activate test
conda install -y -c conda-forge -c bioconda ucsc-gtftogenepred
gtfToGenePred -genePredExt -ignoreGroupsWithoutExons hg38.ensGene.gtf ensGene.txt
conda activate misopy
git clone https://github.com/yarden/rnaseqlib.git
# fix bug
sed -i 's/\["bin",/\[/g' /home/lguanjiawen/miniconda3/envs/misopy/lib/python2.7/site-packages/rnaseqlib/tables.py
# generate gff3 files of multiple alternative splicing event
python rnaseqlib/rnaseqlib/gff/gff_make_annotation.py ./ ./gff --flanking-rule commonshortest --genome-label hg38after generateing event gff3 files, we get the gff/commonshortest directory includes
SE.hg38.gff3
RI.hg38.gff3
A3SS.hg38.gff3
A5SS.hg38.gff3
MXE.hg38.gff3
Sort bam
conda activate test
time samtools sort -@20 /path/to/SRR536342.bam -o /path/to/SRR536342.sort.bam
time samtools index /path/to/SRR536342.sort.bamThen for each event, we run misopy
conda activate misopy
index_gff --index gff/commonshortest/SE.hg38.gff3 indexed/
# run
time miso --run indexed/ /path/to/SRR536342.sort.bam --output-dir miso_result --read-len 101 --paired-end 250 30 -p 20 --event-type SE