-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
51 lines (40 loc) · 1.64 KB
/
utils.py
File metadata and controls
51 lines (40 loc) · 1.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import PyPDF2
import re
from transformers import AutoTokenizer
import yake
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of all pages in *pdf_file*, stripped.

    Pages whose extraction yields ``None`` contribute an empty string,
    so a partially image-only PDF still returns the extractable text.
    """
    pages = PyPDF2.PdfReader(pdf_file).pages
    parts = [page.extract_text() or "" for page in pages]
    return "".join(parts).strip()
def simple_sentence_splitter(text):
    """Split *text* into sentences on whitespace that follows '.', '!' or '?'.

    Uses a lookbehind so the terminating punctuation stays attached to
    its sentence. A string with no such boundary comes back as a
    single-element list.
    """
    boundary = re.compile(r'(?<=[.!?])\s+')
    return boundary.split(text)
def chunk_text(text, max_tokens=800):
    """Greedily pack sentences into chunks of at most *max_tokens* tokens.

    Sentences (from ``simple_sentence_splitter``) are appended to the
    current chunk while the running token count stays within budget; a
    sentence that would overflow starts a new chunk. Any single sentence
    longer than *max_tokens* is truncated at the token level so it can
    still form a chunk on its own.

    Returns a list of non-empty, whitespace-stripped chunk strings.
    """
    chunks = []
    current_chunk = ""
    current_tokens = 0
    for sentence in simple_sentence_splitter(text):
        tokens = tokenizer.encode(sentence, add_special_tokens=False)
        if len(tokens) > max_tokens:
            # Truncate an over-long sentence to the budget; decode back to
            # text so the chunk remains a plain string.
            tokens = tokens[:max_tokens]
            sentence = tokenizer.decode(tokens, skip_special_tokens=True)
        token_len = len(tokens)
        if current_tokens + token_len <= max_tokens:
            # NOTE(review): the joining space may itself tokenize, so the
            # running count can slightly undercount — acceptable slack here.
            current_chunk += " " + sentence
            current_tokens += token_len
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence
            current_tokens = token_len
    if current_chunk:
        chunks.append(current_chunk.strip())
    # Bug fix: count WITHOUT special tokens, matching how chunks were built.
    # The original used the encoder default (add_special_tokens=True), which
    # inflates the count and could wrongly drop chunks that actually fit.
    return [
        chunk
        for chunk in chunks
        if len(tokenizer.encode(chunk, add_special_tokens=False)) <= max_tokens
    ]
def extract_keywords(text, max_keywords=20):
    """Return up to *max_keywords* single-word keywords from *text*.

    Keywords come from a fresh YAKE extractor (English, unigrams only),
    in YAKE's relevance order; the scores are discarded.
    """
    extractor = yake.KeywordExtractor(lan="en", n=1, top=max_keywords)
    return [keyword for keyword, _score in extractor.extract_keywords(text)]