# Source: LLMCodingHallucination/run_pipeline.py at main · DeepSoftwareAnalytics/LLMCodingHallucination · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import os
from pathlib import Path
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from make_window import MakeWindowWrapper
from build_vector import BuildVectorWrapper, BagOfWords
from search_code import CodeSearchWrapper
from build_prompt import BuildPromptWrapper

from utils import CONSTANTS, CodexTokenizer


def make_repo_window(repos, window_sizes, slice_sizes):
    """Run the repository-file windowing step for the given repositories.

    Args:
        repos: Repository names handed to ``MakeWindowWrapper``.
        window_sizes: Window sizes handed to ``MakeWindowWrapper``.
        slice_sizes: Slice sizes handed to ``MakeWindowWrapper``.
    """
    # The first positional slot is where other call sites in this file pass a
    # benchmark; this step passes None there.
    MakeWindowWrapper(None, repos, window_sizes, slice_sizes).window_for_repo_files()


def run_RG1_and_oracle_method(benchmark, repos, window_sizes, slice_sizes):
    """Run code search and build the first-search prompt file for ``CONSTANTS.rg``.

    Only the search and prompt-building stages are active. The windowing and
    vectorization stages, as well as the ``CONSTANTS.gt`` prompt, are kept
    below as commented-out code and are not executed.

    Args:
        benchmark: Benchmark identifier forwarded to the wrappers.
        repos: Repository names forwarded to the wrappers.
        window_sizes: Window sizes; the full list goes to the search step and
            only the first element goes to the prompt builder.
        slice_sizes: Slice sizes; handled the same way as ``window_sizes``.
    """
    vector_type = 'one-gram'

    # Disabled stage: build code snippets for all the repositories.
    # make_repo_window(repos, window_sizes, slice_sizes)
    # Disabled stage: build code snippets for the vanilla retrieval-augmented
    # approach and the ground truth.
    # MakeWindowWrapper(benchmark, repos, window_sizes, slice_sizes).window_for_baseline_and_ground()
    # Disabled stage: build vectors for the vanilla retrieval-augmented
    # approach and the ground truth.
    # vectorizer = BagOfWords
    # BuildVectorWrapper(benchmark, vectorizer, repos, window_sizes, slice_sizes).vectorize_repo_windows()
    # BuildVectorWrapper(benchmark, vectorizer, repos, window_sizes, slice_sizes).vectorize_baseline_and_ground_windows()

    # Search code for the vanilla retrieval-augmented approach and ground truth.
    search_args = (vector_type, benchmark, repos, window_sizes, slice_sizes)
    CodeSearchWrapper(*search_args).search_baseline_and_ground()

    # Build the prompt file for the vanilla retrieval-augmented approach.
    tokenizer_cls = CodexTokenizer
    prompt_mode = CONSTANTS.rg
    # The output name is hard-coded to ws-20-ss-2; it does not follow the
    # window_sizes / slice_sizes arguments.
    rg_prompt_path = 'prompts/rg-one-gram-ws-20-ss-2.jsonl'
    # NOTE(review): scalars (first elements) are passed here, whereas
    # run_RepoCoder_method passes the full lists to BuildPromptWrapper --
    # confirm which form the constructor expects.
    BuildPromptWrapper(
        vector_type,
        benchmark,
        repos,
        window_sizes[0],
        slice_sizes[0],
        tokenizer_cls,
    ).build_first_search_prompt(prompt_mode, rg_prompt_path)

    # Disabled stage: build the ground-truth (CONSTANTS.gt) prompt file.
    # mode = CONSTANTS.gt
    # output_file_path = 'prompts/gt-one-gram-ws-20-ss-2.jsonl'
    # BuildPromptWrapper('one-gram', benchmark, repos, window_sizes, slice_sizes, tokenizer).build_first_search_prompt(mode, output_file_path)


def run_RepoCoder_method(benchmark, repos, window_sizes, slice_sizes, prediction_path):
    """Run the four prediction-based stages in ``CONSTANTS.rgrg`` mode.

    The stages run in order on the predictions at ``prediction_path``:
    windowing, vectorization, code search, and prompt building.

    Args:
        benchmark: Benchmark identifier forwarded to every wrapper.
        repos: Repository names forwarded to every wrapper.
        window_sizes: Window sizes forwarded to every wrapper.
        slice_sizes: Slice sizes forwarded to every wrapper.
        prediction_path: Path of the predictions file consumed by each stage.
    """
    rgrg_mode = CONSTANTS.rgrg
    vector_type = 'one-gram'
    stage_args = (rgrg_mode, prediction_path)

    # Stage 1: windows for the predictions.
    MakeWindowWrapper(
        benchmark, repos, window_sizes, slice_sizes
    ).window_for_prediction(*stage_args)

    # Stage 2: vectorize those windows with the bag-of-words vectorizer.
    BuildVectorWrapper(
        benchmark, BagOfWords, repos, window_sizes, slice_sizes
    ).vectorize_prediction_windows(*stage_args)

    # Stage 3: code search driven by the predictions.
    CodeSearchWrapper(
        vector_type, benchmark, repos, window_sizes, slice_sizes
    ).search_prediction(*stage_args)

    # Stage 4: write the prompt file. The output name is hard-coded to
    # ws-20-ss-2 and does not follow window_sizes / slice_sizes.
    repocoder_prompt_path = 'prompts/repocoder-one-gram-ws-20-ss-2.jsonl'
    # NOTE(review): the full lists are passed to BuildPromptWrapper here,
    # whereas run_RG1_and_oracle_method passes window_sizes[0] / slice_sizes[0]
    # -- confirm which form the constructor expects.
    BuildPromptWrapper(
        vector_type, benchmark, repos, window_sizes, slice_sizes, CodexTokenizer
    ).build_prediction_prompt(rgrg_mode, prediction_path, repocoder_prompt_path)


if __name__ == '__main__':
    # Repositories are the directories directly under repos/ whose names
    # match the '*---*' pattern.
    repos = [entry.name for entry in Path("repos").glob("*---*") if entry.is_dir()]
    window_sizes = [20]
    slice_sizes = [2]  # 20 / 2 = 10
    benchmark = CONSTANTS.codereval_benchmark

    # Build prompts for the RG1 and oracle methods (currently disabled).
    # run_RG1_and_oracle_method(CONSTANTS.codereval_benchmark, repos, window_sizes, slice_sizes)

    # Build prompts for the RepoCoder method from an existing predictions file.
    prediction_path = 'predictions/rg-one-gram-ws-20-ss-2_samples.0.jsonl'
    run_RepoCoder_method(benchmark, repos, window_sizes, slice_sizes, prediction_path)