# datacamp/streaming_data_generator.py at master · johnashu/datacamp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import pandas as pd
import matplotlib.pyplot as plt

# Writing a generator to load data in chunks
def read_large_file(file_object):
    """Yield the lines of ``file_object`` one at a time.

    Lines are pulled with ``readline()`` only as the consumer asks for them,
    so the whole file is never held in memory. Iteration stops at the first
    falsy value ``readline()`` returns (the empty string at end of file).
    """
    current = file_object.readline()
    while current:
        yield current
        current = file_object.readline()


# Preview the first three lines of the file through the lazy generator.
with open('world_dev_ind.csv') as file:
    gen_file = read_large_file(file)
    for _ in range(3):
        print(next(gen_file))

# Stream the file line by line and tally how many times each value of the
# first column (the country) appears in the dataset.

counts_dict = {}

with open('world_dev_ind.csv') as file:
    for line in read_large_file(file):
        # Everything before the first comma is the first column.
        first_col = line.partition(',')[0]
        counts_dict[first_col] = counts_dict.get(first_col, 0) + 1

print(counts_dict)


# Disabled example, kept as a bare string literal so it is never executed:
# reads 'ind_pop.csv' in 10-row chunks and prints the first two chunks.
"""
df_reader = pd.read_csv('ind_pop.csv', chunksize=10)
print(next(df_reader))
print(next(df_reader))
"""
# Writing an iterator to load data in chunks

# Read the CSV lazily, 1000 rows at a time, and work on the first chunk only.
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000)
df_urb_pop = next(urb_pop_reader)
print(df_urb_pop.head())

# Take an explicit copy of the filtered rows: the new column below is then
# written to an independent frame instead of a slice of df_urb_pop, which
# would trigger pandas' SettingWithCopyWarning.
df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB'].copy()

# Pair each total population with its urban share; the share is a percentage,
# hence the * 0.01 when converting to a head count.
pops = zip(df_pop_ceb['Total Population'],
           df_pop_ceb['Urban population (% of total)'])
pops_list = list(pops)
df_pop_ceb['Total Urban Population'] = [
    int(total * pct * 0.01) for total, pct in pops_list
]

df_pop_ceb.plot(kind='scatter', x='Year', y='Total Urban Population')
plt.show()


# Repeat the 'CEB' computation over every chunk of the file and plot the
# combined result.
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000)
ceb_chunks = []

for df_urb_pop in urb_pop_reader:
    # Copy the filtered rows so the column assignment below does not write to
    # a slice of the chunk (avoids SettingWithCopyWarning).
    df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB'].copy()
    pops = zip(df_pop_ceb['Total Population'],
               df_pop_ceb['Urban population (% of total)'])
    pops_list = list(pops)
    # The urban share is a percentage, so scale by 0.01 to get a head count
    # (same formula as the single-chunk computation earlier in this script).
    df_pop_ceb['Total Urban Population'] = [
        int(total * pct * 0.01) for total, pct in pops_list
    ]
    ceb_chunks.append(df_pop_ceb)

# DataFrame.append was removed in pandas 2.0; concatenating once after the
# loop also avoids re-copying the accumulated frame on every iteration.
data = pd.concat(ceb_chunks) if ceb_chunks else pd.DataFrame()

data.plot(kind='scatter', x='Year', y='Total Urban Population')
plt.show()


def plot_pop(filename, country_code):
    """Scatter-plot total urban population against year for one country code.

    The CSV at ``filename`` is read in chunks of 1000 rows. From each chunk
    the rows whose 'CountryCode' equals ``country_code`` are kept and given a
    'Total Urban Population' column, computed as
    'Total Population' * 'Urban population (% of total)' * 0.01 and truncated
    to an int. All kept rows are then combined and plotted once.

    Args:
        filename: Path of the CSV file to read.
        country_code: Value of the 'CountryCode' column to select.

    Returns:
        None. The plot is displayed with ``plt.show()``.
    """
    matching_chunks = []

    for df_urb_pop in pd.read_csv(filename, chunksize=1000):
        # Copy the filtered rows so the column assignment below does not write
        # to a slice of the chunk (avoids SettingWithCopyWarning).
        df_pop = df_urb_pop[df_urb_pop['CountryCode'] == country_code].copy()
        # The urban share is a percentage, so scale by 0.01 for a head count.
        df_pop['Total Urban Population'] = [
            int(total * pct * 0.01)
            for total, pct in zip(df_pop['Total Population'],
                                  df_pop['Urban population (% of total)'])
        ]
        matching_chunks.append(df_pop)

    if not matching_chunks:
        # The reader produced no chunks at all, so there is nothing to plot.
        return

    # DataFrame.append was removed in pandas 2.0; a single concat after the
    # loop also avoids re-copying the accumulated frame for every chunk.
    data = pd.concat(matching_chunks)

    # Plot once, after every chunk has been processed, rather than opening a
    # blocking figure per chunk.
    data.plot(kind='scatter', x='Year', y='Total Urban Population')
    plt.show()
# Plot the urban-population series for each country code of interest.
fn = 'ind_pop_data.csv'

for _code in ('CEB', 'ARB'):
    plot_pop(fn, _code)