-
Notifications
You must be signed in to change notification settings - Fork 134
Expand file tree
/
Copy pathstreaming_data_generator.py
More file actions
92 lines (68 loc) · 2.67 KB
/
streaming_data_generator.py
File metadata and controls
92 lines (68 loc) · 2.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import pandas as pd
import matplotlib.pyplot as plt
# Writing a generator to load data in chunks
def read_large_file(file_object):
    """Lazily yield the contents of *file_object* one line at a time.

    Lines are fetched with ``file_object.readline()`` only as the caller
    asks for them, so the whole file is never held in memory at once.
    Iteration ends the first time ``readline()`` returns an empty (falsy)
    value.
    """
    current_line = file_object.readline()
    while current_line:
        yield current_line
        # Fetch the next line only after the consumer resumes the generator.
        current_line = file_object.readline()
# Preview the file: pull the first three lines from the generator and print
# each one (next() raises StopIteration if the file has fewer than three).
with open('world_dev_ind.csv') as file:
    gen_file = read_large_file(file)
    for _ in range(3):
        print(next(gen_file))
# Stream the file line by line and tally how many times each value occurs in
# the first comma-separated field of a line (the country column of the dataset).
counts_dict = {}
with open('world_dev_ind.csv') as file:
    for line in read_large_file(file):
        # Only the text before the first comma is needed as the key.
        first_col = line.split(',', 1)[0]
        counts_dict[first_col] = counts_dict.get(first_col, 0) + 1
print(counts_dict)
"""
df_reader = pd.read_csv('ind_pop.csv', chunksize=10)
print(next(df_reader))
print(next(df_reader))
"""
# Writing an iterator to load data in chunks: read only the first 1000-row
# chunk of the file and plot the urban population of its 'CEB' rows.
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000)
df_urb_pop = next(urb_pop_reader)
# Only the first chunk is used here, so release the underlying file handle
# instead of leaving the reader open.
urb_pop_reader.close()
print(df_urb_pop.head())

# .copy() makes the filtered rows an independent frame, so adding a column
# below does not raise pandas' SettingWithCopyWarning.
df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB'].copy()

# The urban share is stored as a percentage (see the column name), hence the
# 0.01 factor; the cast truncates to whole people like int() did.
df_pop_ceb['Total Urban Population'] = (
    df_pop_ceb['Total Population']
    * df_pop_ceb['Urban population (% of total)']
    * 0.01
).astype('int64')

df_pop_ceb.plot(kind='scatter', x='Year', y='Total Urban Population')
plt.show()
# Process the whole file chunk by chunk, keeping only the 'CEB' rows, then
# plot the urban population per year for all of them together.
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000)
ceb_chunks = []
for df_urb_pop in urb_pop_reader:
    # .copy() avoids SettingWithCopyWarning when the new column is added.
    df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB'].copy()

    # The urban share is a percentage (see the column name), so scale by 0.01
    # to get a head count; the cast truncates to whole people.
    df_pop_ceb['Total Urban Population'] = (
        df_pop_ceb['Total Population']
        * df_pop_ceb['Urban population (% of total)']
        * 0.01
    ).astype('int64')
    ceb_chunks.append(df_pop_ceb)

# DataFrame.append was removed in pandas 2.0; collecting the pieces and
# concatenating once is the supported (and linear-time) replacement.
data = pd.concat(ceb_chunks) if ceb_chunks else pd.DataFrame()
data.plot(kind='scatter', x='Year', y='Total Urban Population')
plt.show()
def plot_pop(filename, country_code, *, chunksize=1000):
    """Plot total urban population per year for one country code.

    The CSV at *filename* is read in chunks of *chunksize* rows. From each
    chunk the rows whose 'CountryCode' equals *country_code* are kept and a
    'Total Urban Population' column is computed from 'Total Population' and
    'Urban population (% of total)'. The combined rows are shown as a
    scatter plot of 'Year' against 'Total Urban Population'.

    Args:
        filename: Path of the CSV file to read.
        country_code: Value to match in the 'CountryCode' column.
        chunksize: Number of rows to read per chunk (keyword-only).
    """
    country_chunks = []
    for df_urb_pop in pd.read_csv(filename, chunksize=chunksize):
        # .copy() avoids SettingWithCopyWarning when the new column is added.
        df_pop = df_urb_pop[df_urb_pop['CountryCode'] == country_code].copy()

        # The urban share is a percentage (see the column name), so scale by
        # 0.01 to get a head count; the cast truncates to whole people.
        df_pop['Total Urban Population'] = (
            df_pop['Total Population']
            * df_pop['Urban population (% of total)']
            * 0.01
        ).astype('int64')
        country_chunks.append(df_pop)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing a frame inside the loop.
    data = pd.concat(country_chunks) if country_chunks else pd.DataFrame()
    data.plot(kind='scatter', x='Year', y='Total Urban Population')
    plt.show()
# Draw the urban-population plot for each country code of interest, in order.
fn = 'ind_pop_data.csv'
for country_code in ('CEB', 'ARB'):
    plot_pop(fn, country_code)