% File: PutHeaderAndSeqInDB_AllFiles.m
%% Script PutHeaderAndSeqInDB_AllFiles: put many FASTA files in the DB
% Run configuration. The values below are read later in this script and
% most of them are forwarded unchanged to PutHeaderAndSeqInDB.

% --- Database connection -------------------------------------------------
% NOTE(review): host, instance name and credentials are hard-coded here.
DB = DBserver('localhost:2181','Accumulo','instance', 'root','secret');
Tablebase = 'Tseq';            % Prefix of the table names (e.g. [Tablebase 'Info']).

% --- Input location ------------------------------------------------------
Fastadir = 'dirStore';         % Directory scanned for FASTA files (alternative: 'testProtein1').
% Unused alternatives kept for reference:
%   Matdir    = 'dirMat';
%   Fastafile = 'gbenv1._aas.cut';
Skip = 0;                      % Offset added to every file index; e.g. 126 starts at file 127.

% --- Behaviour switches --------------------------------------------------
PARALLEL         = true;       % Split the file list across pMatlab processes.
DoDB             = true;       % Use DB or in-memory Assoc.
DoDBInfo         = true;       % Bind the info table in the DB instead of an empty Assoc.
DoDeleteDB       = false;      % Delete pre-existing tables.
DoPutHeader      = true;
DoPutRawSequence = true;
DoSaveStats      = true;
DoSaveMat        = false;
DoDisp           = false;

% --- Limits passed to PutHeaderAndSeqInDB --------------------------------
BytesLimit      = 5e5;         % Size before sending to server.
LargestSequence = 11000;       % presumably an upper bound on sequence length -- TODO confirm
LargestMeta     = 6000;        % presumably an upper bound on header/metadata length -- TODO confirm
% eval(pRUN('PutHeaderAndSeqInDB_AllFiles',4,{}))
% Ideas:
% -Check if a file has info in Tinfo. If yes, that file is complete. If no, ingest it.
% (note: will not help with degree tables)
% -Use pMatlab to process different files on different processes.
% -Preallocate arrays so they don't change size every iteration.
% -Presum degree counts using associative array.
%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Bind Tinfo: an empty in-memory Assoc when the info table is disabled,
% otherwise the [Tablebase 'Info'] table in the database.
if ~DoDBInfo
    Tinfo = Assoc('','','');
else
    if DoDeleteDB
        % Destructive path: require an explicit 'y' (case-insensitive)
        % before touching existing tables; anything else aborts the script.
        prompt = input('Are you sure you want to create new tables and delete old ones? [y/n]','s');
        if ~strcmpi(prompt,'y')
            return
        end
        clear prompt
        % Bind the existing table so it can be dropped; it is bound again
        % below once the old one is gone.
        Tinfo = DB([Tablebase 'Info']);
        deleteForce(Tinfo);
    end
    Tinfo = DB([Tablebase 'Info']);
end
% Enumerate the candidate FASTA files and decide which of them this
% process will ingest, then open the progress log.
files = dir([Fastadir filesep '*._aas']);
numfiles = size(files,1);
if PARALLEL
    % Distribute an (numfiles-Skip)-by-1 array over the Np processes with
    % the map's default distribution and keep the global row indices that
    % are local to this process.
    myfiles = global_ind(zeros(numfiles-Skip,1,map([Np 1],{},0:Np-1)));
else
    % Serial run: this process takes every file after the skipped prefix.
    myfiles = 1:(numfiles-Skip);
    % Np is used below to build the log name. When pMatlab is not on the
    % path (no Np variable or function), fall back to a single process so
    % a serial run does not fail on an undefined name.
    if exist('Np') == 0 %#ok<EXIST>
        Np = 1;
    end
end
disp(myfiles);
nl = char(10);
cumTotalTime = 0;   % Seconds spent in PutHeaderAndSeqInDB so far.

% NOTE(review): the log name embeds Np (the process count). If several
% processes share Fastadir they all open the same file for writing --
% confirm whether the process id was intended instead.
logName = [Fastadir filesep 'PutHeaderAndSeqInDB_AllFiles.' num2str(Np) '.log'];
FLOG = fopen(logName,'w');
if FLOG == -1
    % Fail here with a clear message instead of on the first log write.
    error('PutHeaderAndSeqInDB_AllFiles:logOpenFailed', ...
        'Could not open log file for writing: %s', logName);
end
% Ingest every file assigned to this process, writing progress to the
% console and to the log file FLOG. The log is closed even when an ingest
% call throws, and the error is then re-raised.
numMine = numel(myfiles);   % Number of file slots assigned to this process.
try
    % Index-based loop so that myfiles may be a row or a column vector.
    for k = 1:numMine
        origi = myfiles(k);
        i = origi + Skip;   % Index into files, after the skipped prefix.
        Fastafile = deblank(files(i).name);

        % Only names of the form gb*_aas are ingested; skip anything else.
        if numel(Fastafile) < 5 || ~strcmp('gb',Fastafile(1:2)) || ~strcmp('_aas',Fastafile(end-3:end))
            continue
        end

        % Format once so the console and the log carry the same timestamp.
        startMsg = sprintf('[%s %4d/%04d] Processing: %s\t',datestr(now),i,numfiles,Fastafile);
        fprintf('%s',startMsg);
        fprintf(FLOG,'%s',startMsg);

        tic;
        PutHeaderAndSeqInDB(DB,DoDB,DoDisp,DoSaveMat,DoDeleteDB,...
            DoPutHeader,DoPutRawSequence,Tablebase,BytesLimit,LargestSequence,LargestMeta,...
            Fastadir,Fastafile,DoSaveStats,DoDBInfo);
        putTime = toc;
        cumTotalTime = cumTotalTime + putTime;

        % Extrapolated total run time of this process, in hours: time spent
        % so far scaled by (slots assigned / slots visited). This uses the
        % per-process position k rather than the global index i, and does
        % not read or overwrite Np.
        estMsg = sprintf('Expected finish %s\n',num2str(cumTotalTime*numMine/k/3600));
        fprintf('%s',estMsg);
        fprintf(FLOG,'%s',estMsg);

        % Disabled: record the per-file put time in Tinfo.
        % if DoDB
        % putTriple(Tinfo,[Fastafile nl],sprintf('putTime|%010.2f\n',putTime),'1\n');
        % else
        % Tinfo = Tinfo + Assoc([Fastafile nl],sprintf('putTime|%010.2f\n',putTime),'1\n');
        % end

        % Pass the delete flag only to the first ingested file.
        DoDeleteDB = false;
    end
catch loopErr
    fclose(FLOG);
    rethrow(loopErr);
end
fclose(FLOG);