-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathData_Loader.py
More file actions
100 lines (76 loc) · 4.22 KB
/
Data_Loader.py
File metadata and controls
100 lines (76 loc) · 4.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import torch
import random
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
# Seed Python's built-in `random` module at import time so that code in this
# module drawing from it (e.g. `random.sample`) is reproducible across runs.
# NOTE: this does not seed NumPy's or torch's random number generators.
random.seed(42)
class UserItemRatingDataset(Dataset):
    """Map-style dataset over three index-aligned tensors.

    Sample ``i`` is the tuple ``(user_tensor[i], item_tensor[i],
    target_tensor[i])``; the dataset length is the size of the user tensor's
    first dimension.
    """

    def __init__(self, user_tensor, item_tensor, target_tensor):
        # The tensors are stored as given; no copy or validation is performed.
        self.user_tensor = user_tensor
        self.item_tensor = item_tensor
        self.target_tensor = target_tensor

    def __len__(self):
        # The user tensor alone defines how many samples exist.
        sample_count = self.user_tensor.size(0)
        return sample_count

    def __getitem__(self, index):
        columns = (self.user_tensor, self.item_tensor, self.target_tensor)
        return tuple(column[index] for column in columns)
# Preprocessing for NCF (Neural Collaborative Filtering).
class Make_Dataset(object):
    """Turn a per-user ratings frame into flat train and evaluation data.

    Args:
        ratings: ``pd.DataFrame`` containing the columns ``userId``,
            ``train_positive``, ``train_negative``, ``test_rating`` and
            ``test_negative``. ``train_positive`` and ``test_negative`` must
            hold a sized sequence of item ids in every row.

    Raises:
        AssertionError: if any of the required columns is missing.
    """

    _REQUIRED_COLUMNS = (
        "userId",
        "train_positive",
        "train_negative",
        "test_rating",
        "test_negative",
    )

    def __init__(self, ratings):
        for column in self._REQUIRED_COLUMNS:
            # Assertions are kept (not replaced by another exception type)
            # so the failure type seen by existing callers is unchanged.
            assert column in ratings.columns, (
                "ratings is missing required column %r" % column
            )
        self.ratings = ratings
        # Number of training positives in each row of `ratings`.
        self.positive_len = ratings["train_positive"].map(len)
        # Tuple (user, item, rating) of NumPy arrays; see `_trainset`.
        self.trainset = self._trainset(ratings)
        # List of four LongTensors; see `_evaluate_data`.
        self.evaluate_data = self._evaluate_data(ratings)

    def _trainset(self, ratings):
        """Build the positive-only training arrays from ``train_positive``.

        Returns:
            Tuple ``(user, item, rating)`` of 1-D NumPy arrays with one entry
            per (row, positive item) pair. ``user`` repeats each ``userId``
            once per positive item of that row and ``rating`` is all ones.
        """
        counts = self.positive_len.to_numpy()
        user = np.repeat(ratings["userId"].to_numpy(), counts)
        item = np.array(
            [item for items in ratings["train_positive"] for item in items]
        )
        rating = np.repeat(1, counts.sum())
        # Debug output: id range and shape of the flattened arrays.
        print(item.min(), item.max(), item.shape)
        print(user.min(), user.max(), user.shape)
        return user, item, rating

    def _evaluate_data(self, ratings):
        """Build the evaluation tensors from the test columns.

        The number of negatives is taken from each row's ``test_negative``
        length rather than assumed to be a fixed constant, so the negative
        user and item tensors always stay aligned.

        Returns:
            List ``[test_user, test_item, test_negative_user,
            test_negative_item]`` of ``torch.LongTensor``. The first two have
            one entry per row of ``ratings``; the last two have one entry per
            (row, test-negative item) pair.
        """
        test_user = np.array(ratings["userId"])
        # presumably one held-out item id per row -- verify against the caller
        test_item = np.array(ratings["test_rating"])

        negatives = ratings["test_negative"]
        negative_counts = negatives.map(len).to_numpy()
        test_negative_user = np.repeat(test_user, negative_counts)
        test_negative_item = np.array(
            [item for items in negatives for item in items]
        )
        # Debug output: id range and shape of the flattened negatives.
        print(test_negative_item.min(), test_negative_item.max(),
              test_negative_item.shape)
        return [
            torch.LongTensor(test_user),
            torch.LongTensor(test_item),
            torch.LongTensor(test_negative_user),
            torch.LongTensor(test_negative_item),
        ]
class SampleGenerator(object):
    """Combine positive training samples with sampled negatives.

    Negatives are drawn once, at construction time, from each row's
    ``train_negative`` candidates using Python's ``random`` module.

    Args:
        user: 1-D array of user ids of the positive training samples.
        item: 1-D array of item ids of the positive training samples.
        rating: 1-D array of labels of the positive training samples.
        ratings: ``pd.DataFrame`` with at least the columns ``userId`` and
            ``train_negative`` (a sequence of candidate item ids per row).
        positive_len: ``pd.Series`` sharing the index of ``ratings`` that
            holds the number of training positives per row.
        num_neg: number of negatives to sample per training positive.

    Raises:
        ValueError: from ``random.sample`` if a row has fewer
            ``train_negative`` candidates than ``positive_len * num_neg``.
    """

    def __init__(self, user, item, rating, ratings, positive_len, num_neg):
        self.user = user  # preprocessed positive user ids
        self.item = item  # preprocessed positive item ids
        self.rating = rating  # preprocessed positive labels
        self.ratings = ratings  # original per-user frame
        self.num_neg = num_neg  # train negative ratio
        self.positive_len = positive_len
        self.train_user, self.train_item, self.train_rating = self.total_train(
            ratings, positive_len, num_neg
        )

    def total_train(self, ratings, positive_len, num_neg):
        """Build the full training arrays: positives followed by negatives.

        Returns:
            Tuple ``(train_user, train_item, train_rating)`` of 1-D NumPy
            arrays. The positives passed to the constructor come first,
            followed by the sampled negatives, which are labelled 0.
        """
        # Non-inplace rename: the caller's Series must not be modified.
        # The concat aligns the counts to `ratings` by index.
        frame = pd.concat([ratings, positive_len.rename("len")], axis=1)
        negative_counts = frame["len"] * num_neg

        negative_user = np.repeat(
            frame["userId"].to_numpy(), negative_counts.to_numpy()
        )
        # One random.sample call per row, in row order, so the sequence of
        # draws from the `random` stream stays deterministic under a seed.
        sampled = [
            random.sample(list(candidates), int(count))
            for candidates, count in zip(frame["train_negative"], negative_counts)
        ]
        negative_item = np.array([item for items in sampled for item in items])
        negative_rating = np.repeat(0, int(negative_counts.sum()))

        train_user = np.hstack((self.user, negative_user))
        train_item = np.hstack((self.item, negative_item))
        train_rating = np.hstack((self.rating, negative_rating))
        return train_user, train_item, train_rating

    def instance_a_train_loader(self, batch_size, num_workers=8):
        """Wrap the combined training arrays in a shuffling ``DataLoader``.

        Args:
            batch_size: number of samples per batch.
            num_workers: number of loader worker processes; defaults to 8,
                the value that was previously hard-coded. Pass 0 to load in
                the main process.

        Returns:
            ``DataLoader`` over a ``UserItemRatingDataset`` whose user and
            item tensors are ``LongTensor`` and whose target tensor is
            ``FloatTensor``.
        """
        dataset = UserItemRatingDataset(
            user_tensor=torch.LongTensor(self.train_user),
            item_tensor=torch.LongTensor(self.train_item),
            target_tensor=torch.FloatTensor(self.train_rating),
        )
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=num_workers,
        )