# BoxE/TestFilteredSubset.py at master · ralphabb/BoxE · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import numpy as np
from KBUtils import load_kb_file
from KBUtils import MissingDict
from NELLProcessing import materialise_simple_hierarchy
import Cnst

# Script: build batched evaluation subsets of the NELL test split.
#
# 1. Load the test and training facts and materialise the training facts with
#    the rules in "RulesNELL.txt".
# 2. Keep the test facts that are NOT contained in the materialised training
#    set, shuffle them with a fixed seed, and save them in batches
#    ("test_subset.kbb").
# 3. Append the remaining test facts (also batched) and save the combined
#    list ("test_subset_all.kbb").
#
# NOTE: np.save appends ".npy" when the file name does not already end with
# it, so the files on disk are "test_subset.kbb.npy" / "test_subset_all.kbb.npy".

# Upper bound on the number of rows in a single saved batch.
MAX_BATCH_SIZE = 15000


def _swap_first_two_columns(facts):
    """Return a zero-initialised copy of *facts* with columns 0 and 1 exchanged.

    Columns 2 and 3 are copied unchanged.  Any column beyond index 3 stays
    zero, exactly as in the hand-written column-by-column copy this replaces.
    """
    swapped = np.zeros_like(facts)
    swapped[:, :4] = facts[:, [1, 0, 2, 3]]
    return swapped


def _row_set(facts):
    """Return the rows of *facts* as a set of tuples for O(1) membership tests."""
    return {tuple(row) for row in facts}


def _rows_not_in(facts, known_rows):
    """Return the rows of *facts* whose tuple form is not in *known_rows*.

    Boolean-mask indexing keeps the result two-dimensional even when no row
    survives, so later column indexing does not fail on an empty result.
    """
    keep = np.array([tuple(row) not in known_rows for row in facts], dtype=bool)
    return facts[keep]


def _split_into_batches(rows, max_batch_size=MAX_BATCH_SIZE):
    """Split *rows* into the fewest near-equal batches of at most *max_batch_size* rows.

    Returns an empty list for empty input, because np.array_split rejects a
    section count of zero.
    """
    if rows.shape[0] == 0:
        return []
    n_batches = int(np.ceil(rows.shape[0] / max_batch_size))
    return np.array_split(rows, n_batches)


def _save_batches(path, batch_list):
    """Save *batch_list* (a list of 2-D arrays) to *path* with np.save.

    Batches produced by np.array_split may differ in row count.  Recent NumPy
    versions refuse to coerce such a ragged list implicitly, so it is packed
    into a 1-D object array (one batch per element) explicitly.  When all
    batches share a shape the list is converted as before, giving a regular
    stacked array.
    """
    if len({batch.shape for batch in batch_list}) <= 1:
        payload = np.asanyarray(batch_list)
    else:
        payload = np.empty(len(batch_list), dtype=object)
        for idx, batch in enumerate(batch_list):
            payload[idx] = batch
    np.save(path, payload)


test_facts = load_kb_file("Other/DatasetsMulti/NELLRuleInjSplit90Mat/test.kb")
# Column-swapped view of the test facts, in the same layout as the facts passed
# to materialise_simple_hierarchy below (presumably the layout it expects).
ts_facts_reform = _swap_first_two_columns(test_facts)
train_facts = load_kb_file("Other/DatasetsMulti/NELLRuleInjSplit90Mat/train.kb")

print(test_facts.shape)

tr_facts_reform = _swap_first_two_columns(train_facts)
print(tr_facts_reform.shape)

print(tr_facts_reform.shape)
# The second return value of the materialisation is not needed here.
mat_tr_facts, _ = materialise_simple_hierarchy(tr_facts_reform, "RulesNELL.txt")
in_training_mat = _row_set(mat_tr_facts)

# Test facts that do not occur in the materialised training set.
new_mat_triples = _rows_not_in(ts_facts_reform, in_training_mat)
print(new_mat_triples.shape)
# Swap back so the rows are in the same column order as test_facts again; this
# is what makes them comparable with test_facts rows further down.
new_mat_triples_ref = _swap_first_two_columns(new_mat_triples)

# Fixed seed so the shuffled order (and hence the batch contents) is reproducible.
np.random.seed(Cnst.DEFAULT_RANDOM_SEED)
np.random.shuffle(new_mat_triples_ref)
separated_batches = list(_split_into_batches(new_mat_triples_ref))
print(len(separated_batches))

in_final_mat = _row_set(new_mat_triples_ref)

# Everything in the test split that was not selected above.
remaining_test = _rows_not_in(test_facts, in_final_mat)
_save_batches("Other/DatasetsMulti/NELLRuleInjSplit90Mat/test_subset.kbb", separated_batches)

separated_batches.extend(_split_into_batches(remaining_test))
_save_batches("Other/DatasetsMulti/NELLRuleInjSplit90Mat/test_subset_all.kbb", separated_batches)
print(len(separated_batches))