-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathblwc.cpp
More file actions
152 lines (126 loc) · 4.25 KB
/
blwc.cpp
File metadata and controls
152 lines (126 loc) · 4.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
/*
* Unix wc command, but for biological sequences.
*
*/
#include <iostream>
#include <queue>
#include <string>
#include <vector>
#include <seqan/seq_io.h>
#include <tclap/CmdLine.h>
#include <SeqFileInWrapper.h>
using std::cerr;
using std::cin;
using std::cout;
using std::endl;
using std::ifstream;
using std::istream;
using std::queue;
using std::string;
using std::vector;
using namespace seqan;
using namespace bltools;
int main(int argc, char * argv[]) {
TCLAP::CmdLine cmd("Equivalent of `wc' for sequence files", ' ', "0.0");
TCLAP::SwitchArg rec_count_arg("m", "length",
"Give the length of each record", cmd);
TCLAP::SwitchArg gc_arg("g", "gc",
"Give the GC proportion (of file or of each record with -m)",
cmd);
TCLAP::SwitchArg include_gap_arg("i", "include-gap",
"Include gaps ('-') in the base count", cmd);
TCLAP::SwitchArg report_total("b", "total-bases",
"Total bases per file (not compatible with -g or -m)", cmd);
TCLAP::SwitchArg report_grand_total("B", "grand-total-bases",
"Total bases across all files (not compatible with -g or -m)", cmd);
TCLAP::UnlabeledMultiArg<string> files("FILE(s)", "filenames", false,
"file name(s)", cmd, false);
cmd.parse(argc, argv);
bool include_gaps = include_gap_arg.getValue();
bool rec_count = rec_count_arg.getValue();
bool gc = gc_arg.getValue();
bool tot_bases = report_total.getValue();
bool gtot_bases = report_grand_total.getValue();
vector<string> infiles = files.getValue();
if(infiles.size() == 0) infiles.push_back("-");
if((tot_bases || gtot_bases) && (rec_count || gc)) {
cerr << "Error: Cannot count total bases and get length per record or GC" << endl;
return 1;
}
CharString id;
CharString seq; // CharString more flexible than Dna5String
SeqFileInWrapper seq_handle;
unsigned base_count = 0;
unsigned total_base_count = 0;
unsigned grand_total_base_count = 0;
unsigned gc_count = 0;
for(string& infile: infiles) {
total_base_count = 0;
base_count = 0;
gc_count = 0;
try {
seq_handle.open(infile);
} catch(Exception const &e) {
cerr << "Error: Could not open " << infile << endl;
seq_handle.close();
return 1;
}
int nrecs_read = 0;
while(!seq_handle.atEnd()) {
try {
readRecord(id, seq, seq_handle.sqh);
nrecs_read++;
} catch (Exception const &e) {
cerr << "Error: " << e.what() << endl;
seq_handle.close();
return 1;
} // End try-catch for record reading.
if(gc || rec_count || tot_bases || gtot_bases) {
// TODO Use the size() or length() or w/e function to get the base_count
// unless you want to avoid counting gaps
for(char& b: seq) {
if(include_gaps || b != '-') {
base_count += 1;
if(gc) {
if(b == 'G' || b == 'C' || b == 'g' || b == 'c') {
gc_count += 1;
}
}
}
}
}
if(rec_count || tot_bases || gtot_bases) {
if(gc) {
cout << infile << "\t" << id << "\t" << ((double)gc_count) / (base_count) << endl;
} else if(rec_count) {
cout << infile << "\t" << id << "\t" << base_count << endl;
}
if(tot_bases) {
total_base_count += base_count;
}
if(gtot_bases) {
grand_total_base_count += base_count;
}
gc_count = 0;
base_count = 0;
} // End rec_count output
} // End single file reading loop
if(!seq_handle.close()) {
cerr << "Error: Problem closing " << infile << endl;
return 1;
}
if(!rec_count) {
if (tot_bases) {
cout << infile << "\t" << total_base_count << endl;
} else if(gc) {
cout << infile << "\t" << ((double)gc_count) / (base_count) << endl;
} else {
cout << infile << "\t" << nrecs_read << endl;
}
}
} // End loop over files
if(gtot_bases) {
cout << "GRAND_TOTAL_BASES" << "\t" << grand_total_base_count << endl;
}
return 0;
}