126 changes: 79 additions & 47 deletions docs/runorder.md
@@ -71,12 +71,6 @@ Firstly we have access to healthcare data:
python3 -m methods.inputs.download_accessibility /data/tmf/access/raw.tif
```

Which must then be turned into smaller tiles:

```shark-run:gdalenv
python3 -m methods.inputs.generate_access_tiles /data/tmf/access/raw.tif /data/tmf/jrc/tif /data/tmf/access
```

## Country borders

We use OSM data for country borders. This requires an API key to access the downloads. To get an API key you must follow these steps:
@@ -134,26 +128,30 @@ DATA_PATH="/data/tmf/gedi"
Once you have this, the download script to pull the raw GEDI data is:

```shark-run:gdalenv
python3 -m methods.inputs.locate_gedi_data /data/tmf/123/buffer.geojson /data/tmf/gedi/granule/info/
python3 -m methods.inputs.download_gedi_data /data/tmf/gedi/granule/info/* /data/tmf/gedi/granule/
python3 -m methods.inputs.locate_gedi_data /data/buffer.geojson /data/gedi/info/
python3 -m methods.inputs.download_gedi_data /data/gedi/info/* /data/gedi/granule/
```

Each GEDI granule is a complete sweep from the ISS along the path that crossed the buffer zone, and so contains
much more data than we need. We thus need to filter this to give us just the GEDI shots we're interested in. At the
same time we filter based on time and quality as per the methodology document:

```shark-run:gdalenv
python3 -m methods.inputs.filter_gedi_data --buffer /data/tmf/123/buffer.geojson \
--granules /data/tmf/gedi/granule/ \
--output /data/tmf/123/gedi.geojson
python3 -m methods.inputs.filter_gedi_data --buffer /data/buffer.geojson \
--granules /data/gedi/granule/ \
--output /data/gedi.geojson
```
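
For intuition, the filtering amounts to a spatial cut against the buffer plus time and quality cuts on each shot. The sketch below is illustrative only; the column names (`l4_quality_flag`, `absolute_time`) and the year range are assumptions about the GEDI L4A product, not details taken from this repository:

```python
# Hypothetical sketch of the filtering step, assuming the granule shots have
# already been loaded into a GeoDataFrame with GEDI L4A-style columns.
import geopandas as gpd
import pandas as pd

buffer_zone = gpd.read_file("/data/buffer.geojson")
shots = gpd.read_file("/data/gedi/granule/shots.geojson")  # hypothetical input

# Spatial cut: keep only shots inside the buffer polygon
shots = shots[shots.within(buffer_zone.geometry.unary_union)]

# Quality and time cuts (column names and years are assumptions)
shots = shots[shots["l4_quality_flag"] == 1]
years = pd.to_datetime(shots["absolute_time"]).dt.year
shots = shots[years.between(2020, 2021)]

shots.to_file("/data/gedi.geojson", driver="GeoJSON")
```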

Once filtered, we can combine the GEDI data with the JRC data to work out the carbon density per land use class:

```shark-run:gdalenv
python3 -m methods.inputs.generate_carbon_density --jrc /data/tmf/jrc/tif \
--gedi /data/tmf/123/gedi.geojson \
--output /data/tmf/123/carbon-density.csv
python3 -m methods.inputs.generate_carbon_density --jrc /data/jrc/AnnualChange/ \
--gedi /data/gedi.geojson \
--output /data/tmf/123/agb.csv
```

```shark-publish
/data/tmf/123/agb.csv
```
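
Conceptually this step averages the GEDI above-ground biomass per JRC land-use class. As a rough sketch only (the column names `luc` and `agbd` are assumptions, and the real script samples the JRC rasters itself):

```python
# Illustrative only: mean above-ground biomass density per land-use class.
import geopandas as gpd

shots = gpd.read_file("/data/gedi.geojson")
# Assumes each shot already carries the JRC land-use class it falls within.
density = shots.groupby("luc")["agbd"].mean().reset_index()
density.columns = ["land_use_class", "carbon_density"]
density.to_csv("/data/tmf/123/agb.csv", index=False)
```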

## Generate matching area
@@ -185,7 +183,6 @@ We use the SRTM elevation data, which we download to cover the matching area:
```shark-run:gdalenv
python3 -m methods.inputs.download_srtm_data --project /data/tmf/project_boundaries/123.geojson \
--matching /data/tmf/project_boundaries/123/matching-area.geojson \
--zips /data/tmf/srtm/zip \
--tifs /data/tmf/srtm/tif
```

@@ -206,9 +203,11 @@ python3 -m methods.inputs.rescale_tiles_to_jrc --jrc /data/tmf/jrc/tif \
--output /data/tmf/rescaled-slopes
```

## Country raster
## Project baselayer rasters

Again, rather than repeatedly dynamically rasterize the country vectors, we rasterise them once and re-use them:
For various source inputs we generate a single raster layer that contains all the data we need, at the correct scale.

First a map of countries:

```shark-run:gdalenv
python3 -m methods.inputs.generate_country_raster --jrc /data/tmf/jrc/tif \
@@ -217,6 +216,24 @@ python3 -m methods.inputs.generate_country_raster --jrc /data/tmf/jrc/tif \
--output /data/tmf/123/countries.tif
```

Then a raster map of ecoregions:

```shark-run:gdalenv
python3 -m methods.inputs.generate_ecoregion_rasters --jrc /data/jrc/AnnualChange/ \
--matching /data/tmf/project_boundaries/123/matching-area.geojson \
--ecoregions /data/tmf/ecoregions/ecoregions.geojson \
--output /data/tmf/ecoregions/ecoregion.tif
```

Finally a map of access data:

```shark-run:gdalenv
python3 -m methods.inputs.generate_access_raster --jrc /data/jrc/AnnualChange/ \
--matching /data/tmf/project_boundaries/123/matching-area.geojson \
--access /data/tmf/access/raw.tif \
--output /data/tmf/123/access.tif
```
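
All three base layers follow the same pattern: burn (or resample) the source data onto the JRC pixel grid once, so the matching stages read aligned rasters instead of re-querying vectors. A minimal sketch of that pattern with rasterio, using hypothetical paths and ignoring the tiling and matching-area cropping the real scripts perform:

```python
# Illustrative only: rasterise country polygons onto the grid of one JRC tile.
import geopandas as gpd
import rasterio
from rasterio import features

countries = gpd.read_file("/data/tmf/osm_borders.geojson")  # hypothetical path

with rasterio.open("/data/jrc/AnnualChange/tile.tif") as jrc:  # hypothetical tile
    # Pair each country geometry with an integer id to burn into the raster
    shapes = ((geom, idx) for idx, geom in enumerate(countries.geometry, start=1))
    burned = features.rasterize(
        shapes,
        out_shape=(jrc.height, jrc.width),
        transform=jrc.transform,
        fill=0,
        dtype="int32",
    )
    profile = jrc.profile
    profile.update(count=1, dtype="int32")

with rasterio.open("/data/tmf/123/countries.tif", "w", **profile) as dst:
    dst.write(burned, 1)
```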

# Pixel matching

Pixel matching is split into three main stages: calculating K, then M, and then finding pairs between them.
@@ -229,39 +246,44 @@ First we make K.
python3 -m methods.matching.calculate_k --project /data/tmf/project_boundaries/123.geojson \
--start_year 2012 \
--evaluation_year 2021 \
--jrc /data/tmf/jrc/tif \
--cpc /data/tmf/fcc-cpcs \
--ecoregions /data/tmf/ecoregions \
--elevation /data/tmf/rescaled-elevation \
--slope /data/tmf/rescaled-slopes \
--access /data/tmf/access \
--jrc /data/jrc/AnnualChange/ \
--cpc /data/jrc/cpc/ \
--ecoregions /data/tmf/ecoregions/ecoregion.tif \
--elevation /data/tmf/rescaled-elevation/ \
--slope /data/tmf/rescaled-slopes/ \
--access /data/tmf/123/access.tif \
--countries-raster /data/tmf/123/countries.tif \
--output /data/tmf/123/k.parquet
```

```shark-publish
/data/tmf/123/k.parquet
```


## Calculate set M

Calculating the set M is a three-step process. First we generate a raster per K pixel, marking the potential matches for that particular pixel:

```shark-run:gdalenv
python3 -m methods.matching.find_potential_matches --k /data/tmf/123/k.parquet \
--matching /data/tmf/123/matching-area.geojson \
--matching /data/tmf/project_boundaries/123/matching-area.geojson \
--start_year 2012 \
--evaluation_year 2021 \
--jrc /data/tmf/jrc/tif \
--cpc /data/tmf/fcc-cpcs \
--ecoregions /data/tmf/ecoregions \
--elevation /data/tmf/rescaled-elevation \
--slope /data/tmf/rescaled-slopes \
--access /data/tmf/access \
--countries-raster /data/tmf/123/countries.tif \
--output /data/tmf/123/matches
--jrc /data/jrc/AnnualChange/ \
--cpc /data/jrc/cpc/ \
--ecoregions /data/tmf/ecoregions/ecoregion.tif \
--elevation /data/tmf/rescaled-elevation/ \
--slope /data/tmf/rescaled-slopes/ \
--access /data/tmf/123/access.tif \
--countries-raster /data/tmf/123/countries.tif \
--output /data/tmf/123/matches/
```

Then these rasters are combined into a single raster containing all the potential matches. The j parameter controls how many concurrent processes the script can use, which is bounded mostly by how much memory you have available. The value 20 works well on our server, but may not suit yours.

```shark-run:gdalenv
python3 -m methods.matching.build_m_raster --rasters_directory /data/tmf/123/matches \
python3 -m methods.matching.build_m_raster --rasters_directory /data/tmf/123/matches/ \
--output /data/tmf/123/matches.tif \
-j 20
```
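
The combination itself is just a pixel-wise OR across the per-K rasters; the -j workers exist to parallelise that under a memory budget. The core idea, stripped of tiling and parallelism (paths are illustrative):

```python
# Illustrative only: union all per-K match rasters into a single mask.
from glob import glob

import rasterio

paths = sorted(glob("/data/tmf/123/matches/*.tif"))

with rasterio.open(paths[0]) as first:
    combined = first.read(1) > 0
    profile = first.profile

for path in paths[1:]:
    with rasterio.open(path) as src:
        combined |= src.read(1) > 0  # OR this K pixel's matches into the union

profile.update(count=1, dtype="uint8")
with rasterio.open("/data/tmf/123/matches.tif", "w", **profile) as dst:
    dst.write(combined.astype("uint8"), 1)
```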
@@ -270,19 +292,25 @@ We then convert that raster into a table of pixels plus the data we need:

```shark-run:gdalenv
python3 -m methods.matching.build_m_table --raster /data/tmf/123/matches.tif \
--matching /data/tmf/123/matching-area.geojson \
--matching /data/tmf/project_boundaries/123/matching-area.geojson \
--start_year 2012 \
--evaluation_year 2021 \
--jrc /data/tmf/jrc/tif \
--cpc /data/tmf/fcc-cpcs \
--ecoregions /data/tmf/ecoregions \
--elevation /data/tmf/rescaled-elevation \
--slope /data/tmf/rescaled-slopes \
--access /data/tmf/access \
--countries-raster /data/tmf/123/countries.tif \
--jrc /data/jrc/AnnualChange/ \
--cpc /data/jrc/cpc/ \
--ecoregions /data/tmf/ecoregions/ecoregion.tif \
--elevation /data/tmf/rescaled-elevation/ \
--slope /data/tmf/rescaled-slopes/ \
--access /data/tmf/123/access.tif \
--countries-raster /data/tmf/123/countries.tif \
--output /data/tmf/123/matches.parquet
```


```shark-publish
/data/tmf/123/matches.parquet
```
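
The table is essentially one row per lit pixel in `matches.tif`, with the matching variables sampled at that location. A stripped-down sketch of the conversion (the real script samples many more layers and defines its own schema):

```python
# Illustrative only: turn the non-zero pixels of the M raster into a point table.
import numpy as np
import pandas as pd
import rasterio
from rasterio.transform import xy

with rasterio.open("/data/tmf/123/matches.tif") as src:
    rows, cols = np.nonzero(src.read(1) > 0)
    xs, ys = xy(src.transform, rows, cols)

table = pd.DataFrame({"lng": xs, "lat": ys})
# ...each covariate raster (elevation, slope, access, CPC, ...) would be
# sampled at these coordinates and added as further columns...
table.to_parquet("/data/tmf/123/matches.parquet")
```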


## Find pairs

Finally we can find the 100 sets of matched pairs. The seed is used to control the random number generator, and using the same seed on each run ensures consistency despite randomness being part of the selection process.
@@ -291,9 +319,8 @@ Finally we can find the 100 sets of matched pairs. The seed is used to control t
python3 -m methods.matching.find_pairs --k /data/tmf/123/k.parquet \
--m /data/tmf/123/matches.parquet \
--start_year 2012 \
--output /data/tmf/123/pairs \
--seed 42 \
-j 1
--output /data/tmf/123/pairs/ \
--seed 42
```
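
The consistency comes entirely from the seed: every random draw in the pairing is made through a generator initialised from it, so re-running with `--seed 42` reproduces the same selections. The principle in isolation:

```python
# Illustrative only: the same seed yields the same "random" selections each run.
import numpy as np

first_run = np.random.default_rng(42).choice(1_000_000, size=5, replace=False)
second_run = np.random.default_rng(42).choice(1_000_000, size=5, replace=False)
assert (first_run == second_run).all()
```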

# Calculate additionality
@@ -304,9 +331,14 @@ Finally this script calculates additionality:
python3 -m methods.outputs.calculate_additionality --project /data/tmf/project_boundaries/123.geojson \
--project_start 2012 \
--evaluation_year 2021 \
--density /data/tmf/123/carbon-density.csv \
--matches /data/tmf/123/pairs \
--density /data/tmf/123/agb.csv \
--matches /data/tmf/123/pairs/ \
--output /data/tmf/123/additionality.csv
cat /data/tmf/123/additionality.csv
```

If you run the additionality step with the environment variable `TMF_PARTIALS` set to some directory, it will also generate GeoJSON files for visualising the pairs and the standardised mean difference calculations for the matching variables. You can add `TMF_PARTIALS=/some/dir` before `python3` to set this just for a specific run of `calculate_additionality`.

```shark-publish
/data/tmf/123/additionality.csv
```
25 changes: 9 additions & 16 deletions methods/inputs/download_accessibility.py
@@ -1,14 +1,14 @@
import io
import os
import sys
import tempfile
import traceback
import zipfile
from shutil import move
from glob import glob

import requests

from biomassrecovery.utils.unzip import unzip # type: ignore

# As taken from https://malariaatlas.org/open-access-policy/
# Under "Creative Commons Attribution 3.0 Unported License"
ACCESS_DATA = "https://data.malariaatlas.org/geoserver/Accessibility/ows?service=CSW&version=2.0.1" \
@@ -22,20 +22,13 @@ def is_tif(fname : str) -> bool:

def download_accessibility_tif(source_url: str, target_path: str) -> None:
    with tempfile.TemporaryDirectory() as tmpdir:
        download_path = os.path.join(tmpdir, "accessibility.zip")
        response = requests.get(source_url, stream=True, timeout=60)
        if not response.ok:
            raise DownloadError(f'{response.status_code}: {response.reason}')
        with open(download_path, 'wb') as output_file:
            print("Downloading accessibility data...")
            for chunk in response.iter_content(chunk_size=1024*1024):
                output_file.write(chunk)
        print(f"Unzipping from {download_path}")
        unzip(
            download_path,
            tmpdir,
            filter_func=is_tif
        )
        with requests.get(source_url, stream=True, timeout=60) as response:
            if not response.ok:
                raise DownloadError(response.status_code, response.reason, source_url)
            with zipfile.ZipFile(io.BytesIO(response.content)) as zipf:
                members = zipf.namelist()
                for member in members:
                    _ = zipf.extract(member, path=tmpdir)

        # There should only be a single TIF file
        tif = glob("*.tif", root_dir=tmpdir)
12 changes: 11 additions & 1 deletion methods/inputs/download_jrc_data.py
@@ -1,5 +1,6 @@
import argparse
import io
import logging
import multiprocessing
import os
import shutil
@@ -13,13 +14,19 @@
import requests
import shapely # type: ignore

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
)

URL_FORMAT = "https://ies-ows.jrc.ec.europa.eu/iforce/tmf_v1/download.py?type=tile&dataset=AnnualChange&lat=%s&lon=%s"

def download_jrc_dataset(tempdir: str, output_dir: str, tile_url: str) -> None:
    with requests.get(tile_url, stream=True, timeout=60) as response:
        if response.status_code == HTTPStatus.NOT_FOUND:
            logging.warning("No file found at %s", tile_url)
            return
        if response.status_code != HTTPStatus.OK:
            logging.warning("Bad status for %s: %d", tile_url, response.status_code)
            raise ValueError(response.status_code)
        try:
            with zipfile.ZipFile(io.BytesIO(response.content)) as zzip:
@@ -32,8 +39,10 @@ def download_jrc_dataset(tempdir: str, output_dir: str, tile_url: str) -> None:
                os.rename(target, final_path)
            except OSError:
                shutil.move(target, final_path)
            logging.info("Successfully fetched %s", tile_url)
        except zipfile.BadZipFile:
            pass
            logging.error("Bad zip file %s: %s", tile_url, response.content)
            raise

def download_jrc_data(
target_tif_directory: str,
@@ -64,6 +73,7 @@ def download_jrc_data(
            url = URL_FORMAT % (slat, slng)
            tile_urls.append(url)

    logging.info("Found %s tiles", len(tile_urls))
    with tempfile.TemporaryDirectory() as tempdir:
        core_count = int(multiprocessing.cpu_count() / 4)
        n_workers = min(len(tile_urls), core_count)
29 changes: 15 additions & 14 deletions methods/inputs/download_shapefiles.py
@@ -1,13 +1,13 @@
import io
import os
import sys
import tempfile
import zipfile
from glob import glob

import geopandas # type: ignore
import requests

from biomassrecovery.utils.unzip import unzip # type: ignore

from ..common import DownloadError

# As taken from the World Bank data catalog https://datacatalog.worldbank.org/search/dataset/0038272
@@ -32,26 +32,27 @@ class UnpackError(Exception):
# 4: convert it to something more useful (i.e., geojson/gpkg)
def download_country_polygons(source_url: str, target_filename: str) -> None:
    with tempfile.TemporaryDirectory() as tmpdir:
        download_path = os.path.join(tmpdir, "countries.zip")
        response = requests.get(source_url, stream=True, timeout=60)
        if not response.ok:
            raise DownloadError(response.status_code, response.reason, source_url)
        with open(download_path, 'wb') as output_file:
            for chunk in response.iter_content(chunk_size=1024*1024):
                output_file.write(chunk)
        unzip_path = os.path.join(tmpdir, "countries")
        unzip(download_path, unzip_path)
        with requests.get(source_url, stream=True, timeout=60) as response:
            if not response.ok:
                raise DownloadError(response.status_code, response.reason, source_url)
            try:
                with zipfile.ZipFile(io.BytesIO(response.content)) as zipf:
                    members = zipf.namelist()
                    for member in members:
                        _ = zipf.extract(member, path=tmpdir)
            except zipfile.BadZipFile as exc:
                print(response.content)
                raise exc

        # find the shape file and convert it to a geojson

        shape_files = glob("*.shp", root_dir=unzip_path)
        shape_files = glob("*.shp", root_dir=tmpdir)
        matches = len(shape_files)
        if matches == 0:
            raise UnpackError("No shape files found in archive")
        elif matches > 1:
            raise UnpackError("Too many shape files found")

        shape_file_path = os.path.join(unzip_path, shape_files[0])
        shape_file_path = os.path.join(tmpdir, shape_files[0])
        shape_file_data = geopandas.read_file(shape_file_path)
        shape_file_data.to_file(target_filename, driver='GeoJSON')
