-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
78 lines (69 loc) · 2.62 KB
/
scraper.py
File metadata and controls
78 lines (69 loc) · 2.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import jsonlines as jsonl
def write_jsonl(lst, path='transcripts.jsonl'):
    """Write every record in *lst* to a JSON Lines file, one record per line.

    Args:
        lst: Iterable of JSON-serialisable records.
        path: Destination file. Defaults to ``transcripts.jsonl`` so existing
            callers keep their behaviour. The file is opened in write mode,
            so any previous content is replaced.
    """
    with jsonl.open(path, 'w') as writer:
        writer.write_all(lst)
# Lower-case substrings that mark a sentence as a stage direction / crowd
# noise rather than spoken transcript text.
_NOISE_TERMS = ('announcer', 'applaud', 'applause', 'audience')

# Sentence boundary: whitespace that follows '.' or '?', unless the preceding
# characters look like an abbreviation ("i.e." style, or capital + lowercase +
# dot such as "Mr."). Note that '!' is not treated as a boundary.
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')

_WHITESPACE_RUN = re.compile(r'\s+')


def clean(text: str) -> str:
    """Return *text* without noise sentences, music notes or extra whitespace.

    The text is split into sentences; any sentence containing one of the
    ``_NOISE_TERMS`` (case-insensitive substring match) is dropped. The
    remaining sentences are joined with single spaces, every ``♪`` character
    is removed, and whitespace runs are collapsed to one space.

    Args:
        text: Raw transcript text.

    Returns:
        The cleaned text, stripped of leading and trailing whitespace.
    """
    kept = []
    for sentence in _SENTENCE_BOUNDARY.split(text):
        lowered = sentence.lower()
        if not any(term in lowered for term in _NOISE_TERMS):
            kept.append(sentence)

    # Remove the music notes BEFORE normalising whitespace; doing it the other
    # way round leaves double/leading/trailing spaces where a note used to be.
    without_notes = ' '.join(kept).replace('♪', '')
    return _WHITESPACE_RUN.sub(' ', without_notes).strip()
# Browser-like User-Agent sent with every request.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# Seconds to wait for the server before giving up on a page.
_REQUEST_TIMEOUT = 30
# Separator between the context paragraph and the transcript body.
_SECTION_BREAK = ' * * * '

# Positions of the relevant "elementor-element" divs on a transcript page.
# NOTE(review): these indices are tied to the current page template; verify
# against a live page if results look wrong.
_TITLE_INDEX = 14
_INTRO_INDEX = 16
_BODY_INDEX_WITHOUT_INTRO = 18
_BODY_INDEX_WITH_INTRO = 19


def _split_artist_title(heading):
    """Split a page heading of the form "Artist: Title (...)" into (artist, title)."""
    heading = heading.replace(" | Transcript ", "").split('(')[0].strip()
    artist, colon, title = heading.partition(':')
    if not colon:
        # No "Artist:" prefix; keep the whole heading as the title instead of
        # failing with IndexError.
        return '', heading
    # partition() keeps any further colons inside the title rather than
    # silently truncating it at the second colon.
    return artist.strip(), title.strip()


def input(link):
    """Download one transcript page and extract its fields.

    NOTE: the name shadows the built-in ``input``; it is kept unchanged so
    existing callers continue to work.

    Waits one second before each request, fetches *link*, and reads the
    title, intro, context and transcript text out of the page's
    ``div.elementor-element`` blocks. The transcript text is passed through
    ``clean``. The extracted title is printed as a progress indicator.

    Args:
        link: URL of the transcript page. Empty or ``None`` is skipped.

    Returns:
        A dict with keys ``Title``, ``Artist``, ``Intro``, ``context`` and
        ``Text``, or ``None`` when the URL is empty, the request fails, or
        the page does not contain enough elements.
    """
    if not link:  # Skip processing if the URL is empty
        print("Skipping empty URL.")
        return None

    # Introduce a delay between requests
    time.sleep(1)

    try:
        with requests.Session() as session:
            res = session.get(link, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT)
            res.raise_for_status()
    except requests.RequestException as exc:
        # One bad URL must not abort the whole scraping run.
        print(f"Request failed for {link}: {exc}")
        return None

    # Redirects are followed automatically, so the final response is never
    # itself a redirect; a non-empty history is what shows one happened.
    if res.history:
        print("Redirecting to:", res.url)

    doc = BeautifulSoup(res.content, "html.parser")
    blocks = doc.find_all("div", class_="elementor-element")  # intro/context of the transcript
    if len(blocks) <= _BODY_INDEX_WITH_INTRO:
        print(f"Insufficient elements found in the HTML for {link}")
        return None

    artist, title = _split_artist_title(blocks[_TITLE_INDEX].text)
    intro = blocks[_INTRO_INDEX].text.strip()

    # The transcript body sits in a different block depending on whether the
    # intro block is empty.
    body_index = _BODY_INDEX_WITH_INTRO if intro else _BODY_INDEX_WITHOUT_INTRO
    parts = blocks[body_index].text.split(_SECTION_BREAK)
    if len(parts) == 2:
        context, transcript = parts
    else:
        # Zero or several separators: no reliable context, keep all the text.
        context = ''
        transcript = "".join(parts)

    text = clean(transcript).strip()
    print(title)
    return {'Title': title, 'Artist': artist, 'Intro': intro, 'context': context, 'Text': text}
# Script body: scrape every URL listed in urls.txt (one per line) and write
# the successfully extracted transcripts out with write_jsonl().
lst = []
file = 'urls.txt'
with open(file, 'r') as f:
    for line in f:
        op = input(line.strip())  # Remove leading/trailing whitespaces
        # input() returns None for blank lines and pages it could not parse;
        # skip those so the output file does not contain `null` records.
        if op is not None:
            lst.append(op)
write_jsonl(lst)