-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathtrain_base.py
More file actions
205 lines (174 loc) · 8.91 KB
/
train_base.py
File metadata and controls
205 lines (174 loc) · 8.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
'''
Scripts to train base classifiers in a nested cross-validation structure using Weka.
See README.md for detailed information.
@author: Yan-Chak Li, Linhua Wang
'''
import argparse
import os
import shutil
from itertools import product
from os import listdir
from os import system
from os.path import abspath, dirname, exists
from os.path import isdir
from sys import argv
from time import time

import numpy as np

from processing_scripts import generate_data
from processing_scripts.common import load_properties, read_arff_to_pandas_df
def str2bool(v):
    """Parse a command-line string into a boolean.

    Accepts common truthy/falsy spellings case-insensitively
    ('yes'/'no', 'true'/'false', 't'/'f', 'y'/'n', '1'/'0').

    Raises:
        argparse.ArgumentTypeError: if the string is not a recognized spelling.
    """
    lowered = v.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
def create_pseudoTestdata(data_dir, feat_folders, original_dir):
    """Build a model-training data directory containing pseudo test entries.

    For each feature folder: load its data.arff, relabel every existing row
    to the 'trainModel' fold, append 20 random pseudo-test rows (random 0/1
    values everywhere, overwritten with gaussian noise on real-valued columns,
    alternating pos/neg class labels), and write the result — together with
    copies of classifiers.txt and weka.properties — under data_dir.

    Parameters
    ----------
    data_dir : str
        Destination directory (e.g. <data_path>/model_built); must exist.
    feat_folders : list of str
        Paths of the original per-feature folders, each holding a data.arff.
    original_dir : str
        Source directory holding classifiers.txt and weka.properties.

    Returns
    -------
    tuple (str, list of str)
        data_dir unchanged, and the list of newly created feature folders.
    """
    new_feat_folders = []
    # shutil.copy is portable and raises on failure, unlike os.system('cp ...')
    # which silently ignored errors.
    shutil.copy(os.path.join(original_dir, 'classifiers.txt'), data_dir)
    shutil.copy(os.path.join(original_dir, 'weka.properties'), data_dir)
    for ff in feat_folders:
        feat_df = read_arff_to_pandas_df(os.path.join(ff, 'data.arff'))
        feature_only = feat_df.drop(columns=['fold', 'seqID', 'cls'])
        # Columns with more than two distinct values are treated as
        # real-valued; the rest are assumed binary.
        real_val_cols = [col for col in feature_only.columns
                         if len(feature_only[col].unique()) > 2]
        feat_df.loc[:, 'fold'] = 'trainModel'
        # Append 20 pseudo test entries after the existing rows.
        n_rows, n_cols = feat_df.shape
        n_real = len(real_val_cols)
        for i in range(20):
            ri = i + n_rows
            feat_df.loc[ri] = np.random.binomial(size=n_cols, n=1, p=0.5)
            feat_df.loc[ri, real_val_cols] = np.random.randn(n_real)
            feat_df.loc[ri, 'fold'] = 'pseudoTest'
            feat_df.loc[ri, 'seqID'] = i
            # Alternate labels so the pseudo test set is class-balanced.
            feat_df.loc[ri, 'cls'] = 'pos' if i % 2 == 0 else 'neg'
        new_feat_dir = os.path.join(data_dir, os.path.basename(ff))
        os.makedirs(new_feat_dir, exist_ok=True)
        shutil.copy(os.path.join(original_dir, 'classifiers.txt'), new_feat_dir)
        shutil.copy(os.path.join(original_dir, 'weka.properties'), new_feat_dir)
        generate_data.convert_to_arff(feat_df, os.path.join(new_feat_dir, 'data.arff'))
        new_feat_folders.append(new_feat_dir)
    return data_dir, new_feat_folders
if __name__ == "__main__":
    ### parse arguments
    # CLI flags cover both LSF/bsub submission parameters (queue, node, time,
    # memory) and training configuration (classpath, fold count, rank/writeModel).
    parser = argparse.ArgumentParser(description='Feed some bsub parameters')
    parser.add_argument('--path', '-P', type=str, required=True, help='data path')
    parser.add_argument('--queue', '-Q', type=str, default='premium', help='LSF queue to submit the job')
    parser.add_argument('--node', '-N', type=str, default='32', help='number of node requested')
    parser.add_argument('--time', '-T', type=str, default='40:00', help='number of hours requested')
    parser.add_argument('--memory', '-M', type=str, default='16000', help='memory requsted in MB')
    parser.add_argument('--classpath', '-CP', type=str, default='groovy_scripts/weka.jar', help='default weka path')
    parser.add_argument('--hpc', type=str2bool, default='true', help='use HPC cluster or not')
    parser.add_argument('--fold', '-F', type=int, default=5, help='number of cross-validation fold')
    parser.add_argument('--rank', type=str2bool, default='False', help='get attribute importance')
    parser.add_argument('--writeModel', type=str2bool, default='False', help='write model or not')
    # parser.add_argument('--create_rank_dir', type=str2bool, default='False', help='getting attribute importance')
    args = parser.parse_args()
    ### record starting time
    start = time()
    ### get the data path
    data_path = abspath(args.path)
    # The last two path components name the data source and the dataset;
    # both are used to name the generated jobs/LSF files below.
    data_source_dir = data_path.split('/')[-2]
    data_name = data_path.split('/')[-1]
    working_dir = dirname(abspath(argv[0]))
    ### get weka properties from weka.properties
    p = load_properties(data_path)
    # One bagged predictor is trained per bag index.
    bag_values = range(int(p['bagCount']))
    ### get the list of base classifiers
    classifiers_fn = data_path + '/classifiers.txt'
    assert exists(classifiers_fn)
    # Lines starting with '#' are treated as commented-out classifiers.
    classifiers = filter(lambda x: not x.startswith('#'), open(classifiers_fn).readlines())
    classifiers = [_.strip() for _ in classifiers]
    ### get paths of the list of features
    fns = listdir(data_path)
    # Skip output/bookkeeping folders and any folder whose name contains 'tcca';
    # each remaining subdirectory is one feature folder.
    excluding_folder = ['analysis', 'feature_rank', 'model_built']
    fns = [fn for fn in fns if not fn in excluding_folder]
    fns = [fn for fn in fns if not 'tcca' in fn]
    fns = [data_path + '/' + fn for fn in fns]
    feature_folders = [fn for fn in fns if isdir(fn)]
    # When ranking attributes or persisting models, retarget data_path and
    # feature_folders to a <data_path>/model_built copy augmented with a small
    # pseudo test set (see create_pseudoTestdata).
    if args.rank or args.writeModel:
        model_built_path = os.path.join(data_path, 'model_built')
        if not exists(model_built_path):
            os.mkdir(model_built_path)
        data_path, feature_folders = create_pseudoTestdata(model_built_path,
                                                           feature_folders,
                                                           original_dir=data_path,
                                                           # create_rank_dir=args.create_rank_dir,
                                                           )
    # ### get paths of the list of features
    # fns = listdir(data_path)
    # excluding_folder = ['analysis', 'feature_rank']
    # fns = [fn for fn in fns if not fn in excluding_folder]
    # fns = [fn for fn in fns if not 'tcca' in fn]
    # fns = [data_path + '/' + fn for fn in fns]
    # feature_folders = [fn for fn in fns if isdir(fn)]
    # assert len(feature_folders) > 0
    # get fold, id and label attribute
    # If a fold attribute is declared in weka.properties, folds are that
    # column's unique values; otherwise folds are numbered 0..foldCount-1.
    if 'foldAttribute' in p:
        df = read_arff_to_pandas_df(feature_folders[0] + '/data.arff')
        fold_values = list(df[p['foldAttribute']].unique())
    else:
        fold_values = range(int(p['foldCount']))
    # if writeModel:
    #     fold_values = ['test']
    id_col = p['idAttribute']
    label_col = p['classAttribute']
    # The jobs file holds one shell command per
    # (feature folder, classifier, fold, bag) combination.
    jobs_fn = "temp_train_base_{}_{}.jobs".format(data_source_dir, data_name)
    job_file = open(jobs_fn, 'w')
    if not args.hpc:
        job_file.write('module load groovy\n')
    def preprocessing(jf):
        # Write one groovy training command per parameter combination into jf.
        # When running locally (not HPC), also append the prediction-combining
        # steps so the whole pipeline runs sequentially from one script.
        classpath = args.classpath
        all_parameters = list(product(feature_folders, classifiers, fold_values, bag_values))
        for parameters in all_parameters:
            project_path, classifier, fold, bag = parameters
            jf.write('groovy -cp %s %s/groovy_scripts/base_predictors.groovy %s %s %s %s %s %s %s\n' % (
                classpath, working_dir, data_path, project_path, fold, bag, args.rank, args.writeModel, classifier))
        if not args.hpc:
            jf.write('python processing_scripts/combine_individual_feature_preds.py %s %s %s\npython processing_scripts/combine_feature_predicts.py %s %s %s\n' % (
                data_path, args.rank, 'False',
                data_path, args.rank, 'False'))
        return jf
    job_file = preprocessing(job_file)
    job_file.close()
    ### submit to hpc if args.hpc != False
    if args.hpc:
        # Build an LSF submission script that loads the required modules, runs
        # the jobs file through mpirun/selfsched, combines the predictions,
        # then deletes the jobs file; submit it with bsub and clean up.
        lsf_fn = 'run_%s_%s.lsf' % (data_source_dir, data_name)
        fn = open(lsf_fn, 'w')
        fn.write(
            '#!/bin/bash\n#BSUB -J EI-%s\n#BSUB -P acc_pandeg01a\n#BSUB -q %s\n#BSUB -n %s\n#BSUB -W %s\n#BSUB -o %s.stdout\n#BSUB -eo %s.stderr\n#BSUB -R rusage[mem=%s]\n' % (
                # '#!/bin/bash\n#BSUB -J EI-%s\n#BSUB -P acc_pandeg01a\n#BSUB -q %s\n#BSUB -n %s\n#BSUB -W %s\n#BSUB -o %s.stdout\n#BSUB -eo %s.stderr\n#BSUB -R himem\n' % (
                data_name, args.queue, args.node, args.time, data_source_dir, data_source_dir, args.memory))
        fn.write('module load java\nmodule load python\nmodule load groovy\nmodule load selfsched\nmodule load weka\n')
        # JVM heap (-Xmx) is capped ~1 GB below the requested memory (MB -> GB)
        # to leave headroom for the rest of the process.
        fn.write('export _JAVA_OPTIONS="-XX:ParallelGCThreads=10"\nexport JAVA_OPTS="-Xmx{}g"\nexport CLASSPATH={}\n'.format(int(float(args.memory)/1024)-1,args.classpath))
        fn.write('mpirun selfsched < {}\n'.format(jobs_fn))
        fn.write('rm {}\n'.format(jobs_fn))
        fn.write('python processing_scripts/combine_individual_feature_preds.py %s %s %s\npython processing_scripts/combine_feature_predicts.py %s %s %s\n' % (
            data_path, args.rank, 'False',
            data_path, args.rank, 'False'))
        fn.close()
        system('bsub < %s' % lsf_fn)
        system('rm %s' % lsf_fn)
    ### run it sequentially otherwise
    else:
        system('sh %s' % jobs_fn)
        system('rm %s' % jobs_fn)
    end = time()
    if not args.hpc:
        print('Elapsed time is: %s seconds' % (end - start))