-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_normalizer.py
More file actions
executable file
·55 lines (42 loc) · 1.87 KB
/
run_normalizer.py
File metadata and controls
executable file
·55 lines (42 loc) · 1.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
import pandas as pd
import json
from utils.utils import load_config
from src.normalizer import NameNormalizer, EntityLinker
# Stanza ranges skipped by default, as inclusive
# (canto, first_stanza, last_stanza) triples: the genealogical stanzas
# specific to Orlando Furioso.
_DEFAULT_SKIP_RANGES = ((3, 22, 62), (10, 77, 89))


def is_in_skip_range(canto, stanza, skip_ranges=_DEFAULT_SKIP_RANGES):
    """Tell whether a stanza lies inside one of the ranges to be skipped.

    Args:
        canto: Canto number of the stanza.
        stanza: Stanza number within that canto.
        skip_ranges: Iterable of ``(canto, first_stanza, last_stanza)``
            triples with inclusive bounds. Defaults to the genealogical
            stanzas of Orlando Furioso (canto 3, stanzas 22-62 and
            canto 10, stanzas 77-89), so existing two-argument callers
            keep their behavior; pass other triples to reuse the filter
            for a different text.

    Returns:
        True if ``(canto, stanza)`` falls within any of the ranges,
        False otherwise.
    """
    return any(
        canto == skip_canto and first <= stanza <= last
        for skip_canto, first, last in skip_ranges
    )
def save_json(data, path):
    """Write *data* to *path* as indented, UTF-8 encoded JSON.

    Missing parent directories are created first. Non-ASCII characters
    are written as-is (``ensure_ascii=False``) rather than as ``\\uXXXX``
    escapes.

    Args:
        data: Any JSON-serializable object.
        path: Destination file path; may be a bare filename.

    Raises:
        TypeError: If *data* contains values ``json`` cannot serialize.
        OSError: If the directory or file cannot be created or written.
    """
    parent_dir = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    payload = json.dumps(data, indent=4, ensure_ascii=False).encode('utf-8')
    # Binary mode keeps the bytes exactly as encoded on every platform.
    with open(path, 'wb') as f:
        f.write(payload)
def main():
    """Normalize the names in the configured CSV and write the results.

    Reads the ``normalization`` section of ``config/settings.yaml``,
    loads the CSV at its ``target_path``, drops the rows for which
    ``is_in_skip_range`` is true, runs ``NameNormalizer.fit_and_normalize``
    on the remainder, and writes:

    * ``output/normalization/<base>_normalized.csv`` - the normalized table
    * ``output/normalization/logs/log_<base>.json`` - ``normalizer.log_data``

    where ``<base>`` is the target file name without its ``.csv`` suffix.
    Prints an error and returns without writing anything when the target
    path is unset or does not exist.
    """
    cfg = load_config("config/settings.yaml")
    # `or {}` also covers a `normalization:` key that is present but empty.
    n_cfg = cfg.get('normalization') or {}

    target_path = n_cfg.get('target_path')
    threshold = n_cfg.get('similarity_threshold', 0.85)
    mention_threshold = n_cfg.get('mention_threshold', 0.90)
    neighbor_count = n_cfg.get('neighbor_count', 5)

    if not target_path or not os.path.exists(target_path):
        print(f"Error: Target path '{target_path}' not found.")
        return

    # Strip only a trailing ".csv"; str.replace would also remove the
    # substring from the middle of the file name.
    filename_base = os.path.basename(target_path)
    if filename_base.endswith('.csv'):
        filename_base = filename_base[:-len('.csv')]

    normalizer = NameNormalizer(threshold=threshold)
    # NOTE(review): the linker is built but never used below. It is kept
    # in case its constructor has side effects - confirm and remove.
    linker = EntityLinker(
        name_threshold=threshold,
        mention_threshold=mention_threshold,
        neighbor_count=neighbor_count
    )

    df_raw = pd.read_csv(target_path)
    # Build an explicit boolean mask: a row-wise DataFrame.apply does not
    # reliably yield a boolean Series when the frame has no rows.
    keep_mask = pd.Series(
        [
            not is_in_skip_range(canto, stanza)
            for canto, stanza in zip(df_raw['canto'], df_raw['stanza'])
        ],
        index=df_raw.index,
        dtype=bool,
    )
    df_plain = df_raw.loc[keep_mask].copy()
    df_plain = normalizer.fit_and_normalize(df_plain)

    plain_out_dir = "output/normalization"
    # Creating the logs sub-directory also creates the output directory.
    os.makedirs(f"{plain_out_dir}/logs", exist_ok=True)
    df_plain.to_csv(f"{plain_out_dir}/{filename_base}_normalized.csv", index=False)
    save_json(normalizer.log_data, f"{plain_out_dir}/logs/log_{filename_base}.json")
# Run the normalization pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()