From 738a061e532621b726c58049aec5ba894dc5f80a Mon Sep 17 00:00:00 2001 From: jwinik <67568423+jwinik@users.noreply.github.com> Date: Tue, 20 Jul 2021 11:38:43 -0500 Subject: [PATCH 01/10] Initial commit --- .../cli_017_rw1_glacier_locations.py | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py diff --git a/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py b/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py new file mode 100644 index 00000000..b7935b05 --- /dev/null +++ b/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py @@ -0,0 +1,114 @@ + +import pandas as pd +import geopandas as gpd +import numpy as np +import urllib +import glob +import io +import requests +import json +import os +import sys +import tabula +import dotenv +dotenv.load_dotenv('C:\\Users\\Jason.Winik\\OneDrive - World Resources Institute\\Documents\\GitHub\\cred\\.env') +utils_path = os.path.join(os.path.abspath(os.getenv('PROCESSING_DIR')),'utils') +if utils_path not in sys.path: + sys.path.append(utils_path) +gdal_path = os.getenv('GDAL_DIR') +if gdal_path not in sys.path: + sys.path.append(gdal_path) +import util_files +import util_cloud +import util_carto +import logging +from zipfile import ZipFile + +# Set up logging +# Get the top-level logger object +logger = logging.getLogger() +for handler in logger.handlers: logger.removeHandler(handler) +logger.setLevel(logging.INFO) +# make it print to the console. +console = logging.StreamHandler() +logger.addHandler(console) +logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') + +# name of table on Carto where you want to upload data +# this should be a table name that is not currently in use +dataset_name = 'cli_017_rw1_glacier_locations' #check + +logger.info('Executing script for dataset: ' + dataset_name) +# create a new sub-directory within your specified dir called 'data' +# within this directory, create files to store raw and processed data +data_dir = util_files.prep_dirs(dataset_name) + +''' +Download data and save to your data directory +''' + +# download the data from the source +url = "https://www.glims.org/download/latest" +raw_data_file = os.path.join(data_dir,os.path.basename(url)+'.zip') +r = urllib.request.urlretrieve(url, raw_data_file) +# unzip source data +raw_data_file_unzipped = raw_data_file.split('.')[0] +zip_ref = ZipFile(raw_data_file, 'r') +zip_ref.extractall(raw_data_file_unzipped) +zip_ref.close() + +''' +Process Data +''' +#need polygon and point + +# load in the polygon shapefile +shapefile = glob.glob(os.path.join(raw_data_file_unzipped, 'glims_points.shp', 'glims_polygon.shp')) +gdf = gpd.read_file(shapefile) + +# convert the data type of column 'PROTECT', 'PROTECT_FE', and 'METADATA_I' to integer +gdf['PROTECT'] = gdf['PROTECT'].astype(int) +gdf['PROTECT_FE'] = gdf['PROTECT_FE'].astype(int) +gdf['METADATA_I'] = gdf['METADATA_I'].astype(int) + +# create a path to save the processed shapefile later +processed_data_file = os.path.join(data_dir, dataset_name+'_edit.shp') +# create an index column to use as cartodb_id +gdf['cartodb_id'] = gdf.index + +# reorder the columns +gdf = gdf[['cartodb_id'] + list(gdf)[:-1]] + +# save processed dataset to shapefile +gdf.to_file(processed_data_file,driver='ESRI Shapefile') + +''' +Upload processed data to Carto +''' +logger.info('Uploading processed data to Carto.') +util_carto.upload_to_carto(processed_data_file, 'LINK') + +''' +Upload original 
data and processed data to Amazon S3 storage +''' +# initialize AWS variables +aws_bucket = 'wri-public-data' +s3_prefix = 'resourcewatch/' + +logger.info('Uploading original data to S3.') +# Upload raw data file to S3 + +# Copy the raw data into a zipped file to upload to S3 +raw_data_dir = os.path.join(data_dir, dataset_name+'.zip') +with ZipFile(raw_data_dir,'w') as zip: + zip.write(raw_data_file, os.path.basename(raw_data_file)) +#Upload raw data file to S3 +uploaded = util_cloud.aws_upload(raw_data_dir, aws_bucket, s3_prefix+os.path.basename(raw_data_dir)) + +logger.info('Uploading processed data to S3.') +# Copy the processed data into a zipped file to upload to S3 +processed_data_dir = os.path.join(data_dir, dataset_name+'_edit.zip') +with ZipFile(processed_data_dir,'w') as zip: + zip.write(processed_data_file, os.path.basename(processed_data_file)) +# Upload processed data file to S3 +uploaded = util_cloud.aws_upload(processed_data_dir, aws_bucket, s3_prefix+os.path.basename(processed_data_dir)) \ No newline at end of file From 946e0fcd17b80ba9690648a9ad78fd240c13051d Mon Sep 17 00:00:00 2001 From: jwinik <67568423+jwinik@users.noreply.github.com> Date: Thu, 22 Jul 2021 15:27:23 -0500 Subject: [PATCH 02/10] renamed columns in both shapefiles --- .../cli_017_rw1_glacier_locations.py | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py b/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py index b7935b05..661fe04a 100644 --- a/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py +++ b/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py @@ -9,7 +9,6 @@ import json import os import sys -import tabula import dotenv dotenv.load_dotenv('C:\\Users\\Jason.Winik\\OneDrive - World Resources Institute\\Documents\\GitHub\\cred\\.env') utils_path = os.path.join(os.path.abspath(os.getenv('PROCESSING_DIR')),'utils') @@ -46,7 +45,6 @@ ''' Download data and save to your data directory ''' - # download the data from the source url = "https://www.glims.org/download/latest" raw_data_file = os.path.join(data_dir,os.path.basename(url)+'.zip') @@ -63,30 +61,40 @@ #need polygon and point # load in the polygon shapefile -shapefile = glob.glob(os.path.join(raw_data_file_unzipped, 'glims_points.shp', 'glims_polygon.shp')) -gdf = gpd.read_file(shapefile) +#points = os.path.abspath('glims_points.shp') +#polygon = os.path.abspath('glims_polygon.shp') +shapefile = glob.glob(os.path.join(raw_data_file_unzipped,'glims_download_82381', 'glims_p*.shp')) +gdf_points = gpd.read_file(shapefile[0]) +gdf_extent = gpd.read_file(shapefile[1]) + +#rename columns points +gdf_points.columns = ['the_geom' if x == 'geometry' else x for x in gdf_points.columns] + +#rename columns extent +extent_col_change = {'length': 'glacier_length', 'geometry': 'the_geom'} +gdf_extent.columns = [extent_col_change.get(x,x) for x in gdf_extent.columns] -# convert the data type of column 'PROTECT', 'PROTECT_FE', and 'METADATA_I' to integer -gdf['PROTECT'] = gdf['PROTECT'].astype(int) -gdf['PROTECT_FE'] = gdf['PROTECT_FE'].astype(int) -gdf['METADATA_I'] = gdf['METADATA_I'].astype(int) +#remove excess extent columns +columns_to_remove = ['loc_unc_x', 'loc_unc_y', 'glob_unc_x', 'glob_unc_y'] +gdf_extent = gdf_extent.drop(columns_to_remove,axis = 1) -# create a path to save the processed shapefile later -processed_data_file = os.path.join(data_dir, dataset_name+'_edit.shp') -# create an index column to use as cartodb_id 
-gdf['cartodb_id'] = gdf.index +#create new field -# reorder the columns -gdf = gdf[['cartodb_id'] + list(gdf)[:-1]] # save processed dataset to shapefile -gdf.to_file(processed_data_file,driver='ESRI Shapefile') +processed_data_points = os.path.join(data_dir, dataset_name +'_locations.shp') +gdf_points.to_file(processed_data_points,driver='ESRI Shapefile') + +processed_data_extent = os.path.join(data_dir, dataset_name +'_extent.shp') +gdf_extent.to_file(processed_data_extent,driver='ESRI Shapefile') + +processed_files = [processed_data_extent, processed_data_points] ''' Upload processed data to Carto ''' logger.info('Uploading processed data to Carto.') -util_carto.upload_to_carto(processed_data_file, 'LINK') +util_carto.upload_to_carto(processed_files, 'LINK') ''' Upload original data and processed data to Amazon S3 storage From 78b955228c1a5b5d01f0d8a166861677001a9d5a Mon Sep 17 00:00:00 2001 From: jwinik <67568423+jwinik@users.noreply.github.com> Date: Fri, 23 Jul 2021 09:56:59 -0500 Subject: [PATCH 03/10] updated readme --- cli_017_rw1_glacier_locations/README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 cli_017_rw1_glacier_locations/README.md diff --git a/cli_017_rw1_glacier_locations/README.md b/cli_017_rw1_glacier_locations/README.md new file mode 100644 index 00000000..319f01c8 --- /dev/null +++ b/cli_017_rw1_glacier_locations/README.md @@ -0,0 +1,19 @@ +## {Resource Watch Public Title} Dataset Pre-processing +This file describes the data pre-processing that was done to [the Glacier Locations](http://glims.colorado.edu/glacierdata/) for [display on Resource Watch](https://resourcewatch.org/data/explore/cli017-Glacier-Extents_replacement?section=All+data&selectedCollection=&zoom=3&lat=0&lng=0&pitch=0&bearing=0&basemap=dark&labels=light&layers=%255B%257B%2522dataset%2522%253A%2522ad218d82-058b-4b8e-b790-44fb6d4b531f%2522%252C%2522opacity%2522%253A1%252C%2522layer%2522%253A%25221ab0f13b-b3cf-46fb-add5-2b802df9a9eb%2522%257D%255D&aoi=&page=1&sort=most-viewed&sortDirection=-1&topics=%255B%2522glacier%2522%255D). + +The source provided the two shapefiles in zipped folder. + +Below, we describe the steps used to download the shapefiles and format them to upload to Carto. + +1. Download the zipped folder +``` +Include any SQL or GEE code you used in a code snippet. +``` + +Please see the [Python script](https://github.com/resource-watch/data-pre-processing/tree/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations) for more details on this processing. + +You can view the processed Glacier Locations dataset [on Resource Watch](https://resourcewatch.org/data/explore/cli017-Glacier-Extents_replacement?section=All+data&selectedCollection=&zoom=3&lat=0&lng=0&pitch=0&bearing=0&basemap=dark&labels=light&layers=%255B%257B%2522dataset%2522%253A%2522ad218d82-058b-4b8e-b790-44fb6d4b531f%2522%252C%2522opacity%2522%253A1%252C%2522layer%2522%253A%25221ab0f13b-b3cf-46fb-add5-2b802df9a9eb%2522%257D%255D&aoi=&page=1&sort=most-viewed&sortDirection=-1&topics=%255B%2522glacier%2522%255D). + +You can also download the original dataset [directly through Resource Watch](https://wri-public-data.s3.amazonaws.com/resourcewatch/cli_017_glacier_extent.zip), or [from the source website](http://www.glims.org/download/). + +###### Note: This dataset processing was done by [Jason Winik](https://www.wri.org/profile/jason-winik), and QC'd by [{name}]({link to WRI bio page}). 
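
The download step in the README above ("Download the zipped folder") corresponds to the urllib/ZipFile calls added in the initial commit. A minimal, self-contained sketch of that step, assuming the https://www.glims.org/download/latest endpoint still serves a zip archive (as the script treats it) and using a plain local `data` directory in place of `util_files.prep_dirs()`:

```python
import os
import urllib.request
from zipfile import ZipFile

# stand-in for the 'data' sub-directory normally created by util_files.prep_dirs()
data_dir = 'data'
os.makedirs(data_dir, exist_ok=True)

# download the GLIMS archive; the script treats the 'latest' endpoint as a zipped release
url = 'https://www.glims.org/download/latest'
raw_data_file = os.path.join(data_dir, os.path.basename(url) + '.zip')
urllib.request.urlretrieve(url, raw_data_file)

# unzip next to the archive, into a folder named after it (minus the .zip extension)
raw_data_file_unzipped = raw_data_file.split('.')[0]
with ZipFile(raw_data_file, 'r') as zip_ref:
    zip_ref.extractall(raw_data_file_unzipped)
```

Using `ZipFile` as a context manager closes the archive automatically, the same pattern the script applies when it later zips files for the S3 uploads.
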
From 574de8464fa7466858b721a197c5d90184d6ad17 Mon Sep 17 00:00:00 2001 From: jwinik <67568423+jwinik@users.noreply.github.com> Date: Tue, 3 Aug 2021 13:41:10 -0500 Subject: [PATCH 04/10] correcting S3 upload --- .../cli_017_rw1_glacier_locations.py | 23 +++++-------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py b/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py index 661fe04a..45d14dc5 100644 --- a/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py +++ b/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py @@ -1,22 +1,13 @@ - import pandas as pd import geopandas as gpd -import numpy as np import urllib import glob -import io import requests -import json import os import sys -import dotenv -dotenv.load_dotenv('C:\\Users\\Jason.Winik\\OneDrive - World Resources Institute\\Documents\\GitHub\\cred\\.env') utils_path = os.path.join(os.path.abspath(os.getenv('PROCESSING_DIR')),'utils') if utils_path not in sys.path: sys.path.append(utils_path) -gdal_path = os.getenv('GDAL_DIR') -if gdal_path not in sys.path: - sys.path.append(gdal_path) import util_files import util_cloud import util_carto @@ -58,11 +49,7 @@ ''' Process Data ''' -#need polygon and point - # load in the polygon shapefile -#points = os.path.abspath('glims_points.shp') -#polygon = os.path.abspath('glims_polygon.shp') shapefile = glob.glob(os.path.join(raw_data_file_unzipped,'glims_download_82381', 'glims_p*.shp')) gdf_points = gpd.read_file(shapefile[0]) gdf_extent = gpd.read_file(shapefile[1]) @@ -78,8 +65,9 @@ columns_to_remove = ['loc_unc_x', 'loc_unc_y', 'glob_unc_x', 'glob_unc_y'] gdf_extent = gdf_extent.drop(columns_to_remove,axis = 1) -#create new field - +#set the geometry of gdf_points and gdf_extent +gdf_points = gdf_points.set_geometry('the_geom') +gdf_extent = gdf_extent.set_geometry('the_geom') # save processed dataset to shapefile processed_data_points = os.path.join(data_dir, dataset_name +'_locations.shp') @@ -94,7 +82,8 @@ Upload processed data to Carto ''' logger.info('Uploading processed data to Carto.') -util_carto.upload_to_carto(processed_files, 'LINK') +util_carto.upload_to_carto(processed_data_points, 'LINK') +util_carto.upload_to_carto(processed_data_extent, 'LINK') ''' Upload original data and processed data to Amazon S3 storage @@ -117,6 +106,6 @@ # Copy the processed data into a zipped file to upload to S3 processed_data_dir = os.path.join(data_dir, dataset_name+'_edit.zip') with ZipFile(processed_data_dir,'w') as zip: - zip.write(processed_data_file, os.path.basename(processed_data_file)) + zip.write(processed_files, os.path.basename(processed_files)) # Upload processed data file to S3 uploaded = util_cloud.aws_upload(processed_data_dir, aws_bucket, s3_prefix+os.path.basename(processed_data_dir)) \ No newline at end of file From 6a021848a40c86575aa931a0855f3a33fe24456c Mon Sep 17 00:00:00 2001 From: jwinik <67568423+jwinik@users.noreply.github.com> Date: Tue, 3 Aug 2021 13:45:46 -0500 Subject: [PATCH 05/10] updated READme --- cli_017_rw1_glacier_locations/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cli_017_rw1_glacier_locations/README.md b/cli_017_rw1_glacier_locations/README.md index 319f01c8..a3392c97 100644 --- a/cli_017_rw1_glacier_locations/README.md +++ b/cli_017_rw1_glacier_locations/README.md @@ -6,6 +6,9 @@ The source provided the two shapefiles in zipped folder. 
Below, we describe the steps used to download the shapefiles and format them to upload to Carto. 1. Download the zipped folder +2. Rename columns to match Carto table and delete unnecessary columns +3. Reuplod to Carto and Resource Watch + ``` Include any SQL or GEE code you used in a code snippet. ``` From 1b11c63abfbf8457056aea526b70601c4f9f058d Mon Sep 17 00:00:00 2001 From: jwinik <67568423+jwinik@users.noreply.github.com> Date: Tue, 3 Aug 2021 13:48:05 -0500 Subject: [PATCH 06/10] Added reviewer bio --- cli_017_rw1_glacier_locations/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli_017_rw1_glacier_locations/README.md b/cli_017_rw1_glacier_locations/README.md index a3392c97..d7032b6f 100644 --- a/cli_017_rw1_glacier_locations/README.md +++ b/cli_017_rw1_glacier_locations/README.md @@ -19,4 +19,4 @@ You can view the processed Glacier Locations dataset [on Resource Watch](https:/ You can also download the original dataset [directly through Resource Watch](https://wri-public-data.s3.amazonaws.com/resourcewatch/cli_017_glacier_extent.zip), or [from the source website](http://www.glims.org/download/). -###### Note: This dataset processing was done by [Jason Winik](https://www.wri.org/profile/jason-winik), and QC'd by [{name}]({link to WRI bio page}). +###### Note: This dataset processing was done by [Jason Winik](https://www.wri.org/profile/jason-winik), and QC'd by [Amelia Snyder](https://www.wri.org/profile/amelia-snyder). From 5920be60013e37fd8f82162e43ba9f2e05352e34 Mon Sep 17 00:00:00 2001 From: jwinik <67568423+jwinik@users.noreply.github.com> Date: Tue, 3 Aug 2021 13:51:52 -0500 Subject: [PATCH 07/10] Updated methodology --- cli_017_rw1_glacier_locations/README.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cli_017_rw1_glacier_locations/README.md b/cli_017_rw1_glacier_locations/README.md index d7032b6f..7df32104 100644 --- a/cli_017_rw1_glacier_locations/README.md +++ b/cli_017_rw1_glacier_locations/README.md @@ -1,13 +1,16 @@ ## {Resource Watch Public Title} Dataset Pre-processing This file describes the data pre-processing that was done to [the Glacier Locations](http://glims.colorado.edu/glacierdata/) for [display on Resource Watch](https://resourcewatch.org/data/explore/cli017-Glacier-Extents_replacement?section=All+data&selectedCollection=&zoom=3&lat=0&lng=0&pitch=0&bearing=0&basemap=dark&labels=light&layers=%255B%257B%2522dataset%2522%253A%2522ad218d82-058b-4b8e-b790-44fb6d4b531f%2522%252C%2522opacity%2522%253A1%252C%2522layer%2522%253A%25221ab0f13b-b3cf-46fb-add5-2b802df9a9eb%2522%257D%255D&aoi=&page=1&sort=most-viewed&sortDirection=-1&topics=%255B%2522glacier%2522%255D). -The source provided the two shapefiles in zipped folder. +The source provided the two shapefiles in a zipped folder. + +1. Glims Glacier Locations (points) +2. Glims Glaceir Extent (polygons) Below, we describe the steps used to download the shapefiles and format them to upload to Carto. -1. Download the zipped folder -2. Rename columns to match Carto table and delete unnecessary columns -3. Reuplod to Carto and Resource Watch +1. Download the zipped folder and import the shapefiles as geopandas dataframes. +2. Rename columns to match Carto table and delete unnecessary columns. +3. Reuplod to Carto and Resource Watch. ``` Include any SQL or GEE code you used in a code snippet. 
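
The renaming step described in the README ("Rename columns to match Carto table and delete unnecessary columns") is done with plain list comprehensions over `GeoDataFrame.columns` in the processing script at this stage. A compact sketch of the same idea, with the column names taken from the script and illustrative input paths (`glims_points.shp` and `glims_polygon.shp` are the two files shipped in the GLIMS archive):

```python
import geopandas as gpd

# read the two GLIMS shapefiles (paths are illustrative)
gdf_points = gpd.read_file('glims_points.shp')
gdf_extent = gpd.read_file('glims_polygon.shp')

# points: rename the geometry column to 'the_geom' so it matches the Carto table
gdf_points.columns = ['the_geom' if x == 'geometry' else x for x in gdf_points.columns]

# extent: rename 'length' to 'glacier_length' and the geometry column to 'the_geom'
extent_col_change = {'length': 'glacier_length', 'geometry': 'the_geom'}
gdf_extent.columns = [extent_col_change.get(x, x) for x in gdf_extent.columns]

# drop the location-uncertainty columns that are not needed on Carto
columns_to_remove = ['loc_unc_x', 'loc_unc_y', 'glob_unc_x', 'glob_unc_y']
gdf_extent = gdf_extent.drop(columns_to_remove, axis=1)

# after the rename, re-register the renamed column as the active geometry
gdf_points = gdf_points.set_geometry('the_geom')
gdf_extent = gdf_extent.set_geometry('the_geom')
```

Because the rename goes through the `.columns` list rather than `DataFrame.rename`, the active geometry name no longer matches afterwards; the explicit `set_geometry('the_geom')` calls are what let `to_file` find the geometry again when the dataframes are written back out to shapefiles.
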
From 6c7f62a3e500abbfb6246438d216857e18451d78 Mon Sep 17 00:00:00 2001 From: jwinik <67568423+jwinik@users.noreply.github.com> Date: Thu, 19 Aug 2021 10:42:23 -0400 Subject: [PATCH 08/10] Apply suggestions from code review Co-authored-by: adawyj97 <49011074+adawyj97@users.noreply.github.com> --- cli_017_rw1_glacier_locations/README.md | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/cli_017_rw1_glacier_locations/README.md b/cli_017_rw1_glacier_locations/README.md index 7df32104..80a2fba4 100644 --- a/cli_017_rw1_glacier_locations/README.md +++ b/cli_017_rw1_glacier_locations/README.md @@ -1,25 +1,21 @@ ## {Resource Watch Public Title} Dataset Pre-processing This file describes the data pre-processing that was done to [the Glacier Locations](http://glims.colorado.edu/glacierdata/) for [display on Resource Watch](https://resourcewatch.org/data/explore/cli017-Glacier-Extents_replacement?section=All+data&selectedCollection=&zoom=3&lat=0&lng=0&pitch=0&bearing=0&basemap=dark&labels=light&layers=%255B%257B%2522dataset%2522%253A%2522ad218d82-058b-4b8e-b790-44fb6d4b531f%2522%252C%2522opacity%2522%253A1%252C%2522layer%2522%253A%25221ab0f13b-b3cf-46fb-add5-2b802df9a9eb%2522%257D%255D&aoi=&page=1&sort=most-viewed&sortDirection=-1&topics=%255B%2522glacier%2522%255D). -The source provided the two shapefiles in a zipped folder. +The source provided the dataset as two shapefiles in a zipped folder. 1. Glims Glacier Locations (points) 2. Glims Glaceir Extent (polygons) -Below, we describe the steps used to download the shapefiles and format them to upload to Carto. +Below, we describe the steps used to reformat the shapefiles to upload to Carto. -1. Download the zipped folder and import the shapefiles as geopandas dataframes. -2. Rename columns to match Carto table and delete unnecessary columns. -3. Reuplod to Carto and Resource Watch. +1. Import the shapefiles as geopandas dataframes. +2. Rename columns to match the current version of Carto table. -``` -Include any SQL or GEE code you used in a code snippet. -``` -Please see the [Python script](https://github.com/resource-watch/data-pre-processing/tree/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations) for more details on this processing. +Please see the [Python script](https://github.com/resource-watch/data-pre-processing/tree/master/cli_017_rw2_glacier_locations/cli_017_rw2_glacier_locations_processing.py) for more details on this processing. -You can view the processed Glacier Locations dataset [on Resource Watch](https://resourcewatch.org/data/explore/cli017-Glacier-Extents_replacement?section=All+data&selectedCollection=&zoom=3&lat=0&lng=0&pitch=0&bearing=0&basemap=dark&labels=light&layers=%255B%257B%2522dataset%2522%253A%2522ad218d82-058b-4b8e-b790-44fb6d4b531f%2522%252C%2522opacity%2522%253A1%252C%2522layer%2522%253A%25221ab0f13b-b3cf-46fb-add5-2b802df9a9eb%2522%257D%255D&aoi=&page=1&sort=most-viewed&sortDirection=-1&topics=%255B%2522glacier%2522%255D). +You can view the processed Glacier Locations dataset [on Resource Watch](link to new dataset). You can also download the original dataset [directly through Resource Watch](https://wri-public-data.s3.amazonaws.com/resourcewatch/cli_017_glacier_extent.zip), or [from the source website](http://www.glims.org/download/). -###### Note: This dataset processing was done by [Jason Winik](https://www.wri.org/profile/jason-winik), and QC'd by [Amelia Snyder](https://www.wri.org/profile/amelia-snyder). 
+###### Note: This dataset processing was done by [Jason Winik](https://www.wri.org/profile/jason-winik), and QC'd by [Yujing Wu](https://www.wri.org/profile/yujing-wu). From 67edfd9f42e56ef88f80db3424732781741f0a3b Mon Sep 17 00:00:00 2001 From: jwinik <67568423+jwinik@users.noreply.github.com> Date: Thu, 19 Aug 2021 11:01:04 -0400 Subject: [PATCH 09/10] Apply suggestions from code review Co-authored-by: adawyj97 <49011074+adawyj97@users.noreply.github.com> --- cli_017_rw1_glacier_locations/README.md | 2 +- .../cli_017_rw1_glacier_locations.py | 45 +++++++++++-------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/cli_017_rw1_glacier_locations/README.md b/cli_017_rw1_glacier_locations/README.md index 80a2fba4..873f32ad 100644 --- a/cli_017_rw1_glacier_locations/README.md +++ b/cli_017_rw1_glacier_locations/README.md @@ -1,5 +1,5 @@ ## {Resource Watch Public Title} Dataset Pre-processing -This file describes the data pre-processing that was done to [the Glacier Locations](http://glims.colorado.edu/glacierdata/) for [display on Resource Watch](https://resourcewatch.org/data/explore/cli017-Glacier-Extents_replacement?section=All+data&selectedCollection=&zoom=3&lat=0&lng=0&pitch=0&bearing=0&basemap=dark&labels=light&layers=%255B%257B%2522dataset%2522%253A%2522ad218d82-058b-4b8e-b790-44fb6d4b531f%2522%252C%2522opacity%2522%253A1%252C%2522layer%2522%253A%25221ab0f13b-b3cf-46fb-add5-2b802df9a9eb%2522%257D%255D&aoi=&page=1&sort=most-viewed&sortDirection=-1&topics=%255B%2522glacier%2522%255D). +This file describes the data pre-processing that was done to [the Glacier Locations](http://glims.colorado.edu/glacierdata/) for [display on Resource Watch](put in the link for the new RW dataset). The source provided the dataset as two shapefiles in a zipped folder. 
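
The "import the shapefiles as geopandas dataframes" step relies on a single glob pattern, since both source files (glims_points.shp and glims_polygon.shp) share the `glims_p` prefix. A small sketch of that lookup; the `glims_download_82381` folder name comes from this particular GLIMS export and will differ for a fresh download, and `sorted()` is added here only to make the points-first ordering explicit:

```python
import os
import glob
import geopandas as gpd

# the unzipped GLIMS archive contains a single download folder holding both shapefiles
raw_data_file_unzipped = os.path.join('data', 'latest')   # illustrative path
shapefile = sorted(glob.glob(os.path.join(raw_data_file_unzipped,
                                          'glims_download_82381', 'glims_p*.shp')))

# 'glims_points.shp' sorts before 'glims_polygon.shp', so index 0 is the point layer
gdf_points = gpd.read_file(shapefile[0])
gdf_extent = gpd.read_file(shapefile[1])
```
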
diff --git a/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py b/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py index 45d14dc5..b01cd72f 100644 --- a/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py +++ b/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py @@ -49,31 +49,24 @@ ''' Process Data ''' -# load in the polygon shapefile +# find the file paths to the shapefiles shapefile = glob.glob(os.path.join(raw_data_file_unzipped,'glims_download_82381', 'glims_p*.shp')) +# read in the point shapefile as a geopandas dataframe gdf_points = gpd.read_file(shapefile[0]) +# read in the extent shapefile as a geopandas dataframe gdf_extent = gpd.read_file(shapefile[1]) -#rename columns points -gdf_points.columns = ['the_geom' if x == 'geometry' else x for x in gdf_points.columns] -#rename columns extent -extent_col_change = {'length': 'glacier_length', 'geometry': 'the_geom'} -gdf_extent.columns = [extent_col_change.get(x,x) for x in gdf_extent.columns] +#rename the columns of the polygon shapefile +gdf_extent.columns = ['glacier_length' if x == 'length' else x for x in gdf_extent.columns] -#remove excess extent columns -columns_to_remove = ['loc_unc_x', 'loc_unc_y', 'glob_unc_x', 'glob_unc_y'] -gdf_extent = gdf_extent.drop(columns_to_remove,axis = 1) -#set the geometry of gdf_points and gdf_extent -gdf_points = gdf_points.set_geometry('the_geom') -gdf_extent = gdf_extent.set_geometry('the_geom') # save processed dataset to shapefile -processed_data_points = os.path.join(data_dir, dataset_name +'_locations.shp') +processed_data_points = os.path.join(data_dir, dataset_name +'_locations_edit.shp') gdf_points.to_file(processed_data_points,driver='ESRI Shapefile') -processed_data_extent = os.path.join(data_dir, dataset_name +'_extent.shp') +processed_data_extent = os.path.join(data_dir, dataset_name +'_extent_edit.shp') gdf_extent.to_file(processed_data_extent,driver='ESRI Shapefile') processed_files = [processed_data_extent, processed_data_points] @@ -81,10 +74,20 @@ ''' Upload processed data to Carto ''' -logger.info('Uploading processed data to Carto.') -util_carto.upload_to_carto(processed_data_points, 'LINK') -util_carto.upload_to_carto(processed_data_extent, 'LINK') - +# create schema for the point dataset on Carto +CARTO_SCHEMA_pt= util_carto.create_carto_schema(gdf_points) +# create empty table for point locations on Carto +util_carto.checkCreateTable(os.path.basename(processed_data_points).split('.')[0], CARTO_SCHEMA_pt) + +# create schema for the extent shapefile on Carto +CARTO_SCHEMA_extent = util_carto.create_carto_schema(gdf_extent) +# create empty table for the extent on Carto +util_carto.checkCreateTable(os.path.basename(processed_data_extent).split('.')[0], CARTO_SCHEMA_extent) + +# upload the dataset to Carto and set the privacy to be 'Public with Link' +util_carto.shapefile_to_carto(os.path.basename(processed_data_points).split('.')[0], CARTO_SCHEMA_pt, gdf_points, 'LINK') +# upload the mask to Carto and set the privacy to be 'Public with Link' +util_carto.shapefile_to_carto(os.path.basename(processed_data_extent).split('.')[0], CARTO_SCHEMA_extent, gdf_extent, 'LINK') ''' Upload original data and processed data to Amazon S3 storage ''' @@ -105,7 +108,11 @@ logger.info('Uploading processed data to S3.') # Copy the processed data into a zipped file to upload to S3 processed_data_dir = os.path.join(data_dir, dataset_name+'_edit.zip') +# find all the necessary components of the two shapefiles +processed_pt_files = 
glob.glob(os.path.join(data_dir, dataset_name + '_points_edit.*')) +processed_extent_files = glob.glob(os.path.join(data_dir, dataset_name +'_extent_edit.*')) with ZipFile(processed_data_dir,'w') as zip: - zip.write(processed_files, os.path.basename(processed_files)) + for file in processed_pt_files + processed_extent_files: + zip.write(file, os.path.basename(file)) # Upload processed data file to S3 uploaded = util_cloud.aws_upload(processed_data_dir, aws_bucket, s3_prefix+os.path.basename(processed_data_dir)) \ No newline at end of file From f4a3be113f15ee77b9423df713833e0c4a15312a Mon Sep 17 00:00:00 2001 From: jwinik <67568423+jwinik@users.noreply.github.com> Date: Thu, 19 Aug 2021 11:01:17 -0400 Subject: [PATCH 10/10] Apply suggestions from code review Co-authored-by: adawyj97 <49011074+adawyj97@users.noreply.github.com> --- cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py b/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py index b01cd72f..0357bde3 100644 --- a/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py +++ b/cli_017_rw1_glacier_locations/cli_017_rw1_glacier_locations.py @@ -26,7 +26,7 @@ # name of table on Carto where you want to upload data # this should be a table name that is not currently in use -dataset_name = 'cli_017_rw1_glacier_locations' #check +dataset_name = 'cli_017_rw2_glacier_locations' #check logger.info('Executing script for dataset: ' + dataset_name) # create a new sub-directory within your specified dir called 'data'
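
As a closing note on the S3 step: a shapefile is a bundle of sidecar files (.shp, .shx, .dbf, .prj, ...), so the zip handed to S3 has to collect every component of both processed layers, and the glob pattern has to match the basename the layers were actually saved under (the script saves the point layer as `*_locations_edit.shp`, so a `*_points_edit.*` pattern would come back empty). A hedged sketch of that packaging step, assuming the local data directory and the processed filenames used in the script above:

```python
import os
import glob
from zipfile import ZipFile

data_dir = 'data'
dataset_name = 'cli_017_rw2_glacier_locations'

# collect every sidecar file of the two processed shapefiles
processed_pt_files = glob.glob(os.path.join(data_dir, dataset_name + '_locations_edit.*'))
processed_extent_files = glob.glob(os.path.join(data_dir, dataset_name + '_extent_edit.*'))

# bundle them into a single zip to hand to util_cloud.aws_upload
processed_data_dir = os.path.join(data_dir, dataset_name + '_edit.zip')
with ZipFile(processed_data_dir, 'w') as zipped:
    for file in processed_pt_files + processed_extent_files:
        zipped.write(file, os.path.basename(file))
```
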