IR2Vec-Classification/preprocess_embed.py at main · IITH-Compilers/IR2Vec-Classification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# Updated script for handling separate data files for train, test, and val

# python preprocess.py --train path/to/train --test path/to/test --val path/to/val --output path/to/output

# Instructions

# python preprocess.py [options]
# [options]
# --train: Path of the train data file
# --test: Path of the test data file
# --val: Path of the val data file
# --output: Path for processed CSV files

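# Structure of the input data (tab-separated, one sample per line)
# The first line of each file is treated as a header and skipped.
# label<\t>vector_dim1<\t>vector_dim2<\t>.......<\t>vector_dimN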

# For processing the already-split data (train/test/val)
# python preprocess.py --train <PATH of the train data file> --test <PATH of the test data file> --val <PATH of the val data file> --output <PATH for processed CSV files>

# ------------------------------------------------------------------------------------------

import argparse
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from collections import Counter

def load_data(filepath):
    """Load a tab-separated embedding file into features and labels.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line is expected to look like
    ``label<TAB>dim1<TAB>...<TAB>dimN``.

    Args:
        filepath: Path of the tab-separated input file.

    Returns:
        A tuple ``(X, targetLabel)``: ``X`` is a ``pandas.DataFrame`` holding
        one row of floats per data line, and ``targetLabel`` is the list of
        integer labels in file order.

    Raises:
        ValueError: If a label cannot be parsed as ``int`` or a vector entry
            cannot be parsed as ``float``.
    """
    rep, targetLabel = [], []
    # The context manager closes the handle even when parsing raises.
    with open(filepath) as handle:
        # Skip the header line; the default keeps an empty file from raising.
        next(handle, None)
        for raw_line in handle:
            line = raw_line.strip('\n\t')
            # Blank lines (e.g. a trailing empty line) carry no sample.
            if not line.strip():
                continue
            label, *values = line.split('\t')
            targetLabel.append(int(label))
            rep.append([float(val) for val in values])
    X = pd.DataFrame(rep)
    return X, targetLabel

def save_to_file(X, Y, filepath):
    """Write labels and feature vectors to a tab-separated file.

    The labels form the leading column(s) and the features follow. Neither a
    header row nor an index column is written.

    Args:
        X: Feature vectors; anything ``pandas.DataFrame`` accepts.
        Y: Labels; anything ``pandas.DataFrame`` accepts.
        filepath: Destination path of the tab-separated output file.
    """
    label_frame = pd.DataFrame(Y)
    feature_frame = pd.DataFrame(X)
    # ignore_index along axis=1 relabels the columns 0..n-1, so the duplicate
    # column names coming from the two frames never clash.
    combined = pd.concat([label_frame, feature_frame], axis=1, ignore_index=True)
    combined.to_csv(filepath, sep='\t', index=False, header=None)

def process_and_save(data_path, output_path, filename):
    """Load one data file and write it to the output directory.

    When ``data_path`` does not exist, a warning is printed and nothing is
    written.

    Args:
        data_path: Path of the input data file.
        output_path: Directory the processed file is written into.
        filename: Name of the processed file inside ``output_path``.
    """
    if os.path.exists(data_path):
        X, Y = load_data(data_path)
        # The values reported as "shape" are row counts (len), not tuples.
        print(f"Loaded data from {data_path}: X.shape={len(X)}, Y.shape={len(Y)}")
        destination = os.path.join(output_path, filename)
        save_to_file(X, Y, destination)
        print(f"Data saved to {destination}")
    else:
        print(f"Warning: Data file not found at {data_path}")

if __name__ == '__main__':
    # Command-line entry point: convert the train, test and val data files
    # into train.csv, test.csv and val.csv inside the output directory.
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', required=True, help='Path to the training data file')
    parser.add_argument('--test', required=True, help='Path to the testing data file')
    parser.add_argument('--val', required=True, help='Path to the validation data file')
    parser.add_argument('--output', required=True, help='Output directory for processed CSV files')

    args = parser.parse_args()

    # exist_ok=True creates the directory only when it is missing, without a
    # separate existence check that could race with another process.
    os.makedirs(args.output, exist_ok=True)

    for source_path, target_name in (
        (args.train, 'train.csv'),
        (args.test, 'test.csv'),
        (args.val, 'val.csv'),
    ):
        process_and_save(source_path, args.output, target_name)