-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdata_exploration.py
More file actions
121 lines (103 loc) · 3.9 KB
/
data_exploration.py
File metadata and controls
121 lines (103 loc) · 3.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#Importing libraries
import os
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
#Visualization Libraries
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
#To extract hashtags
import neattext.functions as nfx
import re
#pyspark libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import length
from pyspark.sql.functions import concat_ws, collect_list
# Build the Spark session used for all subsequent DataFrame work.
spark = (
    SparkSession.builder
    .appName("DataProcess")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)
# Disable Weights & Biases logging for any downstream training code.
os.environ["WANDB_DISABLED"] = "true"
#export PYSPARK_PYTHON=./venv/bin/python
# Read the training data with pandas first, then hand it to Spark.
train_path = 'data/Train.csv'
df = pd.read_csv(train_path)
# Probe each column: a column whose values cannot be cast to str is a
# likely culprit for serialization failures in spark.createDataFrame.
for column_name, dtype in df.dtypes.items():
    try:
        df[column_name].astype(str)
    except Exception as e:
        print(f"Column '{column_name}' has dtype '{dtype}' and may be causing serialization issues: {e}")
# Convert the pandas DataFrame into a PySpark DataFrame and preview it.
sdf = spark.createDataFrame(df)
sdf.show()
# Drop every row that contains a NaN in any column, then preview.
sdf = sdf.dropna()
sdf.show()
# Count of positive, negative and neutral reviews, most frequent first.
counts = sdf.groupBy("label").count().orderBy(col("count").desc())
counts.show()
# Count of the annotator-agreement values.
agree_count = sdf.groupBy("agreement").count().orderBy(col("agreement").desc())
agree_count.show()
# Length (in characters) of each review.
review_length = sdf.select(length('safe_text')).withColumnRenamed("length(safe_text)", "review_length")
review_length.show()
# Length of the longest review.
max_length = review_length.agg({"review_length": "max"}).withColumnRenamed("max(review_length)", "max_review_length")
max_length.show()
# Length of the shortest review.
# BUG FIX: the original renamed "max(review_length)" here, but a min()
# aggregation produces a column named "min(review_length)", so the rename
# was a silent no-op; the rename target is also corrected to match.
min_length = review_length.agg({"review_length": "min"}).withColumnRenamed("min(review_length)", "min_review_length")
min_length.show()
# Histogram of tweet lengths, saved to disk.
text_length = review_length.rdd.map(lambda row: row[0])
print(text_length)
length_values = text_length.collect()
sns.histplot(length_values, kde=True, color='skyblue', edgecolor='black', linewidth=1.2)
plt.title('Distribution of Text Lengths')
plt.xlabel('Text Length')
plt.ylabel('Count')
plt.savefig("DistributionofTextLengths.png")
# Bar chart of sentiment-label frequencies.
label_pairs = counts.rdd.map(lambda row: (row['label'], row['count'])).collect()
labels, count_values = zip(*label_pairs)
# One palette colour per distinct label.
colors = sns.color_palette('viridis', len(labels))
plt.figure()
plt.bar(labels, count_values, color=colors)
plt.title('Distribution of Sentiments')
plt.xlabel('Sentiment Label')
plt.ylabel('Count')
plt.savefig("DistributionofSentiments.png")
# Bar chart of annotator-agreement percentage frequencies
# (the original comment mislabelled this as the sentiment plot).
agreement_pairs = agree_count.rdd.map(lambda row: (row['agreement'], row['count'])).collect()
labels, count_values = zip(*agreement_pairs)
# One palette colour per distinct agreement value.
colors = sns.color_palette('pastel', len(labels))
plt.figure()
plt.bar(labels, count_values, color=colors)
plt.title('Distribution of Agreement Percentages')
plt.xlabel('Agreement Percentage')
plt.ylabel('Frequency')
plt.savefig("DistributionofAgreementPercentages.png")
# Build one corpus string by joining every 'safe_text' value with spaces.
corpus_df = sdf.agg(concat_ws(' ', collect_list('safe_text')).alias('text'))
corpus = corpus_df.first()['text']
# Render a word cloud of the corpus on a white background and save it.
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(corpus)
plt.figure(figsize=(8, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=1)
plt.savefig("WordCloud.png")