-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_package_utilities.py
116 lines (104 loc) · 5.15 KB
/
data_package_utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""File contains the utility functions used to create the data package i.e.
the resources, the metadata file (datapackage.json) and the resource files."""
import os
import shutil
import dataflows # pip install dataflows
import requests # pip install requests
import pandas
def _strip_duplicate_suffix(column_name):
    """Function returns 'column_name' without a trailing ".1".
    The ".1" is presumably appended by pandas when it de-duplicates
    column labels. Labels that are not strings, or that do not end
    with ".1", are returned unchanged.
    Parameters
    - column_name: a column label of a pandas DataFrame"""
    suffix = '.1'
    if isinstance(column_name, str) and column_name.endswith(suffix):
        return column_name[:-len(suffix)]
    return column_name


def _read_dac_table(excel_path, usecols, country_label):
    """Function reads one table from the first sheet of an excel file.
    The first four rows of the sheet are skipped, rows with any empty
    value are removed and the 'country_label' column is renamed
    to 'Country'.
    Parameters
    - excel_path: path of the excel file on disk
    - usecols: the excel column range to read e.g. 'A:H'
    - country_label: the label pandas assigned to the column that is
      to be renamed to 'Country'"""
    data_frame = pandas.read_excel(excel_path,
                                   sheet_name=0,
                                   skiprows=[0, 1, 2, 3],
                                   usecols=usecols,
                                   header=0)
    # 'axis' is passed by keyword and 'thresh' is left out: recent pandas
    # versions do not accept a positional axis nor both 'how' and 'thresh'
    data_frame.dropna(axis=0, how='any', inplace=True)
    data_frame.rename(columns={country_label: 'Country'}, inplace=True)
    return data_frame


def load_data(source: str = None) -> tuple:
    """Function loads the data specified by the 'source'.
    The excel file is downloaded into the 'tempdownload' directory, the
    two tables of its first sheet are converted to csv files and a
    dataflows Flow is created for each csv file.
    Parameters
    - source: the url to download the excel data file from
    Returns
    - tuple of two dataflows.Flow objects: the first loads the
      'official-dac-values' resource, the second loads the
      'dac-assistance-percentage-gni' resource
    Raises
    - ValueError: if 'source' is empty or has no file name after
      the last '/'
    - requests.HTTPError: if the download returns an error status
    """
    if not source:
        raise ValueError("'source' must be a non-empty url")
    # get the datafile name from the provided source
    data_file_name = source[source.rfind('/') + 1:]
    if not data_file_name:
        raise ValueError(
            "cannot get a file name from source '{0}'".format(source))

    # create the directory for storing the downloaded data,
    # it is fine if it already exists.
    os.makedirs('tempdownload', exist_ok=True)
    # create a filepath for where the datafile will be stored
    temp_file_path = 'tempdownload/{0}'.format(data_file_name)

    # download the data from 'source'; the timeout stops the call from
    # waiting forever on an unresponsive server and the status check
    # avoids writing an error page to disk as if it was the data file
    response = requests.get(source, timeout=60)
    response.raise_for_status()
    # write the downloaded file to disk
    with open(temp_file_path, 'wb') as tempfile:
        tempfile.write(response.content)

    # read the excel columns containing
    # dac assistance value into panda dataframe
    data_frame_dac_assistance_value = _read_dac_table(
        temp_file_path, 'A:H', 'Unnamed: 0')
    # read the excel columns containing
    # dac assistance GNI percentage into panda dataframe
    data_frame_dac_assistance_gni = _read_dac_table(
        temp_file_path, 'J:Q', 'Unnamed: 16')

    # the name of columns have an appended ".1", remove it
    data_frame_dac_assistance_gni.rename(
        columns={column_name: _strip_duplicate_suffix(column_name)
                 for column_name in data_frame_dac_assistance_gni.columns},
        inplace=True)
    # reverse the column order
    # to ensure 'Country' column is in front
    data_frame_dac_assistance_gni \
        = data_frame_dac_assistance_gni.iloc[:, ::-1]

    # convert the dataframes to csv files stored next to the download;
    # splitext keeps the whole name when the file has no extension
    base_path = os.path.splitext(temp_file_path)[0]
    value_csv_path = base_path + '.csv'
    gni_csv_path = base_path + 'B.csv'
    data_frame_dac_assistance_value.to_csv(value_csv_path,
                                           index=False, header=True)
    data_frame_dac_assistance_gni.to_csv(gni_csv_path,
                                         index=False, header=True)

    # load the csv files and
    # create the Flow objects used to create 'official-dac-values' resource
    # and 'dac-assistance-percentage-gni' resource
    dac_assistance_value_flow = dataflows.Flow(
        dataflows.load(value_csv_path, name='official-dac-values'))
    dac_assistance_gni_flow = dataflows.Flow(
        dataflows.load(gni_csv_path, name='dac-assistance-percentage-gni'))
    # return the flow objects
    return (dac_assistance_value_flow, dac_assistance_gni_flow)
def create_package(flows: tuple = None):
    """Function creates data package.
    Data Packages are created as a folder ('foreign_aid_package') and
    also a zip/archive file ('foreign_aid_package_zip.zip').
    Parameters
    - flows: tuple which contains the dataflows from which the
      datapackage is created
    Raises
    - TypeError: if 'flows' is not provided"""
    # fail with a clear message instead of the unpacking error
    # that '*flows' raises when 'flows' is None
    if flows is None:
        raise TypeError("create_package() requires 'flows': a tuple of "
                        "dataflows to create the data package from")
    # build the Flow object that creates the data packages
    package_flow = dataflows.Flow(
        *flows,
        dataflows.dump_to_path(out_path='foreign_aid_package'),
        dataflows.dump_to_zip(out_file='foreign_aid_package_zip.zip'))
    # run the Flow, this writes the folder and the zip file
    package_flow.process()
def clean_up_process():
    """Function cleans up after the package making process is completed.
    Specifically, it removes the 'tempdownload' directory and the files
    in it. Nothing happens if the directory does not exist, so the
    function can be called more than once."""
    try:
        shutil.rmtree('tempdownload')
    except FileNotFoundError:
        # the directory was never created or is already removed;
        # other errors (e.g. permissions) are still raised
        pass