# Job-Finder/mlmodel.py (16A9DA/Job-Finder)
from datasets import load_dataset
import pandas as pd
import re
import unicodedata
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics.pairwise import cosine_similarity
import json


def clean_text(text):
    """Normalize a free-text job description.

    Steps, in order:
      1. replace HTML-style tags (``<p>``, ``</div>``, ``<br/>``) with a space,
      2. replace runs of digits with a space,
      3. replace the symbols ``@ , $ & /`` with a space,
      4. NFKD-normalize and drop combining marks (removes accents),
      5. collapse every whitespace run into a single space.

    Args:
        text: Raw description. Non-string values (``None``, or the float NaN
            pandas uses for missing cells) are treated as empty text.

    Returns:
        The cleaned string, or ``""`` for non-string input. Leading and
        trailing spaces are not stripped.
    """
    if not isinstance(text, str):
        return ""

    # Match whole tags. The previous pattern "[<.*?>]" was a character class
    # and therefore deleted every '<', '.', '*', '?' and '>' in the text
    # while leaving the tag names behind. Requiring a letter after '<' keeps
    # comparisons such as "a < b" intact. Runs before digit removal so tags
    # like <h1> are still recognisable.
    text = re.sub(r"</?[A-Za-z][^<>]*>", " ", text)
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"[@,$&/]", " ", text)

    # Decompose accented characters, then drop the combining marks.
    text = unicodedata.normalize("NFKD", text)
    text = "".join(c for c in text if not unicodedata.combining(c))

    # Collapse whitespace last: NFKD can itself introduce spaces.
    return re.sub(r"\s+", " ", text)


def load_and_clean_dataset():
    """Fetch the recruitment dataset and return it as a cleaned DataFrame.

    Loads the ``train`` split, drops the columns this module does not use,
    runs ``clean_text`` over ``"Long Description"`` and renames the
    ``"__index_level_0__"`` column to ``"Salary"``.

    Returns:
        pandas.DataFrame: the cleaned dataset.
    """
    raw_split = load_dataset(
        "hanshan1988/recruitment-dataset-job-descriptions-english-sample-pt5",
        split="train",
    )

    # Columns that are not needed downstream.
    unused_columns = [
        "Exp Years",
        "English Level",
        "Published",
        "Long Description_lang",
        "id",
    ]
    frame = pd.DataFrame(raw_split).drop(columns=unused_columns)

    # Clean the long description text.
    frame["Long Description"] = frame["Long Description"].apply(clean_text)

    # The former index column is exposed as "Salary".
    # NOTE(review): confirm this column really holds salary values.
    return frame.rename(columns={"__index_level_0__": "Salary"})


def plot_skill_frequency(df):
    """Show a bar chart of how often each ``"Primary Keyword"`` value occurs.

    Args:
        df: DataFrame with a ``"Primary Keyword"`` column.
    """
    # value_counts() yields one entry per distinct keyword, most frequent first.
    frequency = df["Primary Keyword"].value_counts()

    plt.figure(figsize=(12, 6))
    plt.bar(frequency.index, frequency.values)

    plt.title("Frequency of Skills in Dataset")
    plt.xlabel("Skills")
    plt.ylabel("Frequency")
    plt.xticks(rotation=90)  # vertical labels so they do not overlap
    plt.show()


def prepare_train_test(df):
    """Split the dataset 80/20 and TF-IDF-vectorize the keyword column.

    ``"Primary Keyword"`` is the input feature and ``"Position"`` the target.
    The vectorizer is fitted on the training part only and then applied to
    the test part.

    Args:
        df: DataFrame with ``"Primary Keyword"`` and ``"Position"`` columns.

    Returns:
        tuple: ``(X_train, X_test, Y_train, Y_test, tfid, X_train_tfid,
        X_test_tfid)``.
    """
    keywords = df["Primary Keyword"]
    positions = df["Position"]
    X_train, X_test, Y_train, Y_test = train_test_split(
        keywords, positions, test_size=0.2, random_state=30
    )

    tfid = TfidfVectorizer()
    # Learn the vocabulary/IDF weights from the training data only ...
    X_train_tfid = tfid.fit_transform(X_train)
    # ... and reuse them unchanged for the test data.
    X_test_tfid = tfid.transform(X_test)

    return (
        X_train,
        X_test,
        Y_train,
        Y_test,
        tfid,
        X_train_tfid,
        X_test_tfid,
    )


def train_model(X_train_tfid, Y_train):
    """Fit a default-configured ``MLPClassifier`` and return it.

    An MLP was chosen because the target has many distinct classes.

    Args:
        X_train_tfid: Vectorized training features.
        Y_train: Training labels.

    Returns:
        The fitted ``MLPClassifier``.
    """
    # fit() returns the estimator itself, so build and train in one step.
    return MLPClassifier().fit(X_train_tfid, Y_train)


# Convenience entry point so main.py can obtain the fitted vectorizer, the
# trained model and the training data in a single call.
def load_ml_model():
    """Load the dataset, then fit the TF-IDF vectorizer and the classifier.

    Delegates to ``prepare_train_test`` and ``train_model`` rather than
    repeating their logic, so the split parameters and model configuration
    are defined in one place only.

    Returns:
        tuple: ``(tfid, tfd_MLP, X_train_tfid, X_train, df)`` — the fitted
        vectorizer, the trained classifier, the vectorized training
        features, the raw training features and the cleaned DataFrame.
    """
    df = load_and_clean_dataset()

    # Only the training-side outputs are needed here.
    X_train, _, Y_train, _, tfid, X_train_tfid, _ = prepare_train_test(df)

    tfd_MLP = train_model(X_train_tfid, Y_train)

    return tfid, tfd_MLP, X_train_tfid, X_train, df


def predict_data(user_skills: list, tfid, tfd_MLP, X_train_tfid, X_train, df):
    """Predict a position for each skill and attach the closest job posting.

    For every skill the classifier predicts a position. The training sample
    with the highest cosine similarity to the skill is then used to look up
    the company name, description and salary in ``df``.

    Args:
        user_skills: Skill strings, one prediction per entry.
        tfid: Fitted vectorizer exposing ``transform``.
        tfd_MLP: Trained classifier exposing ``predict``.
        X_train_tfid: Vectorized training features, row-aligned with
            ``X_train``.
        X_train: Training features; its index holds the ``df`` labels of the
            training rows.
        df: DataFrame with ``"Company Name"``, ``"Long Description"`` and
            ``"Salary"`` columns.

    Returns:
        dict: keys ``"Positions"``, ``"Companies"``, ``"Descriptions"`` and
        ``"Salaries"``, each a list with one entry per input skill.
    """
    company = {"Positions": [], "Companies": [], "Descriptions": [], "Salaries": []}

    for skill in user_skills:
        # transform() expects an iterable of documents, hence the 1-item list.
        vec = tfid.transform([skill])
        # predict() returns an array; take its single element.
        pred = tfd_MLP.predict(vec)[0]

        # Similarity of the skill to every training sample (a continuous
        # score, not a 0/1 flag); argmax picks the closest sample.
        # NOTE(review): if the skill shares no term with the fitted
        # vocabulary, all scores are presumably 0 and the first training
        # sample is selected — confirm whether that fallback is acceptable.
        similarity = cosine_similarity(vec, X_train_tfid)
        index = int(np.argmax(similarity))

        # X_train.index holds *labels* of df, so the lookup must be
        # label-based (.loc). Positional .iloc only worked while df kept a
        # default 0..n-1 index. Fetch the row once and reuse it.
        row = df.loc[X_train.index[index]]

        company["Positions"].append(pred)
        company["Companies"].append(row["Company Name"])
        company["Descriptions"].append(row["Long Description"])
        # Plain int: numpy int64 is not JSON serializable (FastAPI response).
        company["Salaries"].append(int(row["Salary"]))

    return company


# Demo: classify one sample from the test set.
def example_prediction(X_test_tfid, tfd_MLP):
    """Print the model's prediction for the first row of ``X_test_tfid``.

    Args:
        X_test_tfid: Vectorized test features; only element ``[0]`` is used.
        tfd_MLP: Trained model exposing ``predict``.
    """
    first_sample = X_test_tfid[0]
    predicted_labels = tfd_MLP.predict(first_sample)
    print(predicted_labels[0])


if __name__ == "__main__":
    # Script mode: load the data, plot keyword frequencies, train the model
    # and print one example prediction.
    dataset = load_and_clean_dataset()
    plot_skill_frequency(dataset)

    # Only the training labels and the two vectorized matrices are needed.
    _, _, train_labels, _, _, train_matrix, test_matrix = prepare_train_test(
        dataset
    )

    classifier = train_model(train_matrix, train_labels)
    example_prediction(test_matrix, classifier)