diff --git a/.gitignore b/.gitignore index eac8079..123285e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ .DS_Store *.npy data - +Figures # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/evaluations/ex_ante_evaluation_template.Rmd b/evaluations/ex_ante_evaluation_template.Rmd new file mode 100644 index 0000000..3a92a4c --- /dev/null +++ b/evaluations/ex_ante_evaluation_template.Rmd @@ -0,0 +1,1003 @@ +--- +output: + html_document: + theme: spacelab + df_print: paged + toc: yes + toc_float: yes + pdf_document: + toc: yes +params: + proj: null + t0: null + input_dir: null + output_dir: null + fullname: null + country_path: null + shapefile_path: null + pairs_path: null + carbon_density_path: null + branch: null +--- + +```{r include=FALSE} + +# TO KNIT THIS NOTEBOOK, RUN THE FOLLOWING LINE IN A SHELL TERMINAL: + +# Rscript -e "rmarkdown::render(input='~/evaluations/R/ex_ante_evaluation_template.Rmd', output_file='~/evaluations/R/example_project.html', params=list(proj='example_project', t0=2010, ...))" + +# Mandatory args: proj, t0 +# Optional args: input dir, output dir, fullname, country_path, shapefile path, pairs_path, carbon density path +# You must either specify input dir and output dir OR provide absolute paths to each of the objects required + +``` + +```{r settings, include=FALSE} +knitr::opts_chunk$set( + echo = FALSE, warning=FALSE,message=FALSE) + +library(tidyverse) +library(sf) +library(reshape2) +library(maps) +library(mapdata) +library(ggspatial) +library(arrow) +library(rnaturalearth) +library(rnaturalearthdata) +library(rnaturalearthhires) +library(stringr) +library(jsonlite) +library(countrycode) +library(scales) +library(here) +library(patchwork) +library(knitr) +library(kableExtra) + +``` + +```{r read_inputs, echo=FALSE,warning=FALSE, message=FALSE} + +project_name <- params$proj +start_year <- as.numeric(params$t0) +branch <- params$branch + +``` + +--- +title: "`r paste0('4C Ex-Ante Evaluation: ', if (!is.null(params$fullname)) { params$fullname } else { gsub('_', ' ', project_name) %>% stringr::str_to_title() })`" +subtitle: "`r format(Sys.Date(), "%B %Y")`" +--- + +```{r set_paths, echo=FALSE,warning=FALSE, message=FALSE} + +# get output format + +output_format <- ifelse(knitr::is_latex_output(), "latex", "html") + +# get script path + +script_path <- here('scripts') + +# get explainer diagram path + +diagram_path <- here('methods_diagram.png') + +# get data path + +if (!is.null(params$output_dir)) { + data_path <- paste0(params$output_dir,'/',project_name) +} + +# get path to pairs + +if (!is.null(params$pairs_path)) { + pairs_path <- params$pairs_path +} else { pairs_path <- file.path(data_path,'pairs') } + +# read shapefile + +if (!is.null(params$input_dir)) { + input_dir <- params$input_dir +} + +if (!is.null(params$shapefile_path)) { + shapefile_path <- params$shapefile_path +} else { shapefile_path <- paste0(input_dir,'/',project_name,'.geojson') } +shapefile <- read_sf(shapefile_path) + +# read carbon density + +if (!is.null(params$carbon_density_path)) { + carbon_density_path <- params$carbon_density_path +} else { carbon_density_path <- list.files(data_path,pattern='carbon',full.names=T)[1] } +carbon_density <- read.csv(carbon_density_path) + +# read country path + +if (!is.null(params$country_path)) { + country_path <- params$country_path +} else { country_path <- list.files(path = data_path, pattern = "country-list", full.names = TRUE)} + +``` + +```{r read_pairs, echo=FALSE} + +# get filenames and filter for matched points + 
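+# Each pairs parquet contains one set of matched point pairs: columns prefixed
+# "k_" describe the project (treatment) pixels and columns prefixed "s_" their
+# matched counterfactual pixels. The code below stacks all pair files, tags each
+# row with its source file, and then splits the k_/s_ columns back out into
+# separate project and counterfactual datasets.
+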
+files_full_raw <- list.files(pairs_path, + pattern='*.parquet',full.names=T,recursive=F) +files_full <- files_full_raw[!grepl('matchless',files_full_raw)] +files_short_raw <- list.files(path=pairs_path, + pattern='*.parquet',full.names=F,recursive=F) +files_short <- files_short_raw[!grepl('matchless',files_short_raw)] + +# initialise dfs + +vars <- c(colnames(read_parquet(files_full[1])),'pair') +paired_data_raw <- data.frame(matrix(ncol = length(vars), nrow = 0)) %>% + setNames(vars) %>% + mutate( + pair = as.factor(pair), + k_trt = as.factor(k_trt), + s_trt = as.factor(s_trt) + ) + +for(j in 1:length(files_full)){ + + # read parquet file + + f <- data.frame(read_parquet(files_full[j]),check.names = FALSE) + + # add identity column + + f$pair <- as.factor(c(replicate(nrow(f),str_remove(files_short[j], "\\.parquet$")))) + + # append data to bottom of df + + paired_data_raw <- bind_rows(paired_data_raw,f) + +} + +# generate separate datasets for project and counterfactual + +project <- paired_data_raw %>% dplyr::select(starts_with('k'),pair) +cf <- paired_data_raw %>% dplyr::select(starts_with('s'),pair) + +# create project-counterfactual merged dataset + +colnames(cf) <- colnames(project) +pair_merged <- bind_rows(project,cf) +names(pair_merged) <- str_sub(names(pair_merged),3) +names(pair_merged)[names(pair_merged) == "ir"] <- "pair" + +# add type column and remove excess cols + +data <- pair_merged %>% + mutate(type=c(replicate(nrow(project),'Project'),replicate(nrow(cf),'Counterfactual'))) %>% + select(-c(contains('trt'),ID)) + +``` + +```{r get_shapefile_area, echo=FALSE} + +project_area_ha <- as.numeric(sum(st_area(st_make_valid(shapefile)))/10000) + +``` + +```{r get_country_names} + +# define function for extracting country names + +get_country_names <- function(country_codes_path) { + codes <- as.character(fromJSON(country_codes_path)) + country_names <- countrycode(codes, 'iso2c', 'country.name') + country_names[country_names == 'Congo - Kinshasa'] <- 'Democratic Republic of the Congo' + return(country_names) + } + +# get country names + +country_vec <- get_country_names(country_path) + + # define function for printing the country names if there are multiple + + if (length(country_vec) > 1) { + country_string <- paste(country_vec[-length(country_vec)], collapse = ", ") + country_string <- paste(country_string, "and", country_vec[length(country_vec)]) + } else { + country_string <- country_vec[1] + } + + +``` + +\ + +# Introduction + +This Report has been prepared by researchers at the Cambridge Centre for Carbon Credits (4C) and has been funded by a charitable grant from the Tezos Foundation. 4C utilises innovative, evidence-based approaches to examine the scientific basis of nature-based carbon conservation initiatives and, insodoing, provides a way for different stakeholders to assess the quality of carbon credits (ex post and/or ex ante). + +**Disclaimer: Nothing in this Report constitutes formal advice or recommendations, an endorsement of proposed action, or intention to collaborate; instead, it sets out the details of an evaluation using a method which is still under development. The Report is considered complete as of the publication date shown, though methods are likely to change in future.** + +\ + +# About the project + +`r if (!is.null(params$fullname)) { params$fullname } else { gsub('_', ' ', project_name) %>% stringr::str_to_title() }` is located in `r country_string`. 
The proposed area measures `r format(project_area_ha, big.mark = ",", scientific = FALSE, digits = 3)` hectares. + +For the purposes of this evaluation, we have set the proposed start date to `r start_year`. + +```{r echo=FALSE} + +# ________________ FILL IN PROJECT-SPECIFIC INFORMATION __________________ + +# Replace this chunk with a short narrative about the context of the project and why it was chosen for evaluation by 4C. Include any other details relevant to the interpretation of this document. + +``` + + + +\ + +# Introduction to the 4C method + +*Our method for forecasting ex-ante additionality remains under development.* + +The 4C approach to forecasting additionality involves identifying places that experienced similar deforestation levels in the past as the project area does today. We start by analyzing forest cover changes in the project area between 10 years ago and the present day. Using pixel-matching techniques, we then identify comparable places outside the project that experienced similar deforestation trends between 20 and 10 years ago (the *matching period*). This allows us to match the deforestation trajectory of the project with that of the matched pixels, but offset in time. This concept is illustrated by the left-hand diagonal arrow in the figure below. + +We can consider the matched pixels as a historical representation of the project as it is today. By examining deforestation in the matched pixels over the subsequent 10 years (the *baseline period*), we estimate a *baseline prediction* — the deforestation expected in the project area under the counterfactual (business-as-usual) scenario. This rate is then projected forward over the next 10 years, as illustrated by the right-hand diagonal arrow in the figure below. We convert the deforestation rate to carbon dioxide emissions using best estimates of carbon density. + +```{r, echo=FALSE, fig.align='center', fig.width=6} + +knitr::include_graphics(diagram_path) + +``` + + +Making predictions about future deforestation is challenging, and there are multiple sources of uncertainty at play. These include: the quantification of carbon, the choice of matching pixels, the effect of leakage and impermanence, future political changes and market forces. We are constantly improving our method in order to minimise these uncertainties. Due to the inherent uncertainty associated with ex-ante (before-the-fact) predictions, carbon credits should only ever be quantified and issued ex-post (after the fact). + +More information about 4C's approach to impact evaluation can be found below. + +[Quantify Earth, an interactive platform which demonstrates our pixel-matching approach in action](https://quantify.earth/#/globe/1) + +[4C explainer page](https://4c.cst.cam.ac.uk/about/additionality-leakage-and-permanence) + +[Cambridge Carbon Impact: an evaluation of some existing carbon projects](https://www.cambridge.org/engage/coe/article-details/6409c345cc600523a3e778ae) + +[Our paper on the social value of impermanent carbon credits](https://www.nature.com/articles/s41558-023-01815-0) + +[The PACT methodology for ex-post evaluations](https://www.cambridge.org/engage/coe/article-details/657c8b819138d23161bb055f) + +\ + + +# Methods + +The following sections will detail how we arrived at a forecast of future deforestation and the potential to generate additionality by reducing this deforestation. 
This includes the location and quality of the matched points, the deforestation rates in each set of points, and the carbon density values used to convert these deforestation rates to carbon emissions reductions. + +\ + +### Location of matched points + +We sampled `r format((nrow(data)/2), big.mark = ",", scientific = FALSE, digits = 3)` points within the project and an equal number of matching points from outside of the project. We used these matched points to make a prediction of the counterfactual scenario for deforestation. + +Below we show the location of the matching points (shown in blue) relative to the project (shown in red), both at the country and project scale. + +`r if(nrow(data) > 20000){"Note that, for clarity and computational efficiency, we show only 10% of the points."}` + +```{r match_locations, warning=FALSE, echo=FALSE, message=FALSE} + +# downsample no. of points by 90% + +if(nrow(data) > 20000){ + data_forplot <- data %>% sample_frac(0.1) +} else { + data_forplot <- data +} + +# plot location of matching points + +country_map <- ne_countries(country = country_vec, returnclass = "sf", scale= "large") + +# transform crs + +shapefile <- st_transform(shapefile, st_crs(country_map)) + +ggplot(data=country_map) + + geom_sf(colour=NA,fill='grey80')+ + geom_point(data=data_forplot,mapping=aes(x=lng,y=lat,colour=type),alpha=0.5,size=0.5) + + scale_color_manual(values=c('blue','red'),labels=c('Matched points','Project'))+ + coord_sf()+ + theme_void()+ + annotation_scale(text_cex=1.5,location='bl')+ + theme(legend.title = element_blank(), + text=element_text(size=16), + legend.position='none') + +xmin <- filter(data, type=='Project') %>% select(lng) %>% min() +xmax <- filter(data, type=='Project') %>% select(lng) %>% max() +ymin <- filter(data, type=='Project') %>% select(lat) %>% min() +ymax <- filter(data, type=='Project') %>% select(lat) %>% max() + +ggplot(data=country_map) + + geom_sf(colour=NA,fill='grey80')+ + geom_point(data=data_forplot,mapping=aes(x=lng,y=lat,colour=type),alpha=0.5,size=0.5) + + scale_color_manual(values=c('blue','red'),labels=c('Matched points','Project'))+ + coord_sf(xlim=c(xmin-0.5,xmax+0.5),ylim=c(ymin-0.5,ymax+0.5))+ + theme_void()+ + annotation_scale(text_cex=1.5,location='bl')+ + theme(legend.title = element_blank(), + text=element_text(size=16), + legend.position='none') + +``` + +### Quality of matches + +Here we show how well the matching points align with the project in terms of our matching variables. Correspondence between the project (shown in red in the plots below) and the matched points (shown in blue) indicates that the the matched points are composed of places that are similar to the project in terms of the drivers of deforestation and are expected to exhibit similar deforestation trends. + +- Inaccessibility (motorized travel time to healthcare, minutes) + +- Slope ($^\circ$) + +- Elevation (meters) + +- Forest cover at t0 (start year, %) + +- Deforestation at t0 (%) + +- Forest cover at t-5 (5 years prior to start year, %) + +- Deforestation at t-5 (%) + +- Forest cover at t-10 (10 years prior to start year, %) + +- Deforestation at t-10 (%) + +Forest cover and deforestation are measured as the proportion of pixels within a 1km radius of a particular point which are classified either as undisturbed forest (in the case of forest cover) or deforested (in the case of deforestation) by the JRC Tropical Moist Forest dataset. 
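+To make this definition concrete, the sketch below shows one way a forest cover or deforestation proportion around a single point could be computed from a table of JRC pixels. This is an illustrative toy example only: the report reads these values from the pairs parquet files rather than computing them, and the `jrc_pixels` data frame, its columns and the function name here are hypothetical.
+
+```{r, eval=FALSE, echo=TRUE}
+# Illustration only: share of JRC pixels within ~1 km of a point that are
+# undisturbed forest (class 1) or deforested (class 3) at year t0.
+# `jrc_pixels` is a hypothetical data frame with columns lat, lng and luc_<year>.
+cpc_at_point <- function(point_lat, point_lng, jrc_pixels, t0) {
+  # approximate planar distances in metres (adequate over ~1 km)
+  d_m <- sqrt(((jrc_pixels$lat - point_lat) * 111320)^2 +
+              ((jrc_pixels$lng - point_lng) * 111320 * cos(point_lat * pi / 180))^2)
+  luc <- jrc_pixels[d_m <= 1000, ][[paste0("luc_", t0)]]
+  c(forest_cover  = 100 * mean(luc == 1),  # % undisturbed forest
+    deforestation = 100 * mean(luc == 3))  # % deforested
+}
+```
+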
+ +More information about the datasets we use can be found below: + +[MAP access to healthcare](https://malariaatlas.org/project-resources/accessibility-to-healthcare/) + +[SRTM elevation data](https://www.earthdata.nasa.gov/sensors/srtm) + +[JRC tropical moist forest dataset](https://forobs.jrc.ec.europa.eu/TMF) + +\ + +```{r match_quality, warning=FALSE,message=FALSE, echo=FALSE} + +# plot matches + +source(file.path(script_path,'plot_matchingvars.R')) + +plot_matching_variables(data,ex_ante='true') + +``` + +\ + +### Standardised mean differences + +We can quantify the similarity in matching variables in terms of their standardised mean difference (SMD). The SMD allows us to quantify the similarity between the project and the matched points in a way that is comparable across variables. + +In the below plot, the blue points indicate the SMD value (i.e. the amount of difference between the project and matched points, in standard deviations) for each variable. Values further from zero indicate a larger difference. The grey dotted lines at (-0.25, +0.25) represent the boundary within which our SMDs would ideally fall in order for the project and matched points to be considered well-matched. + +\ + +```{r smd} + +std_mean_diff <- function(pairs_path) { + + # clean data + + files_full_raw <- list.files(pairs_path, + pattern='*.parquet',full.names=T,recursive=F) + files_full <- files_full_raw[!grepl('matchless',files_full_raw)] + files_short_raw <- list.files(path=pairs_path, + pattern='*.parquet',full.names=F,recursive=F) + files_short <- files_short_raw[!grepl('matchless',files_short_raw)] + + # initialise dfs + + vars <- c(colnames(read_parquet(files_full[1])),'pair') + df <- data.frame(matrix(ncol=length(vars),nrow=0)) %>% + setNames(vars) %>% + mutate(k_trt=as.factor(k_trt), + s_trt=as.factor(s_trt)) + + for(j in 1:length(files_full)){ + + # read in all parquet files for a given project + + f <- data.frame(read_parquet(files_full[j])) %>% + mutate(k_trt=as.factor(k_trt), + s_trt=as.factor(s_trt)) + + # append data to bottom of df + + df <- bind_rows(df,f) + + } + + # calculate smd + + smd_results <- data.frame(variable = character(), smd = numeric(), stringsAsFactors = FALSE) + + variables <- c('cpc10_d','cpc5_d','cpc0_d', + 'cpc10_u','cpc5_u','cpc0_u', + 'access','slope','elevation') + + for (var in variables) { + k_var <- df[[paste0("k_", var)]] + s_var <- df[[paste0("s_", var)]] + + k_mean <- mean(k_var, na.rm = TRUE) + s_mean <- mean(s_var, na.rm = TRUE) + k_sd <- sd(k_var, na.rm = TRUE) + s_sd <- sd(s_var, na.rm = TRUE) + + pooled_sd <- sqrt((k_sd^2 + s_sd^2) / 2) + smd <- (k_mean - s_mean) / pooled_sd + + smd_results <- rbind(smd_results, data.frame(variable = var, smd = smd, stringsAsFactors = FALSE)) + } + + return(smd_results) +} + +results <- std_mean_diff(pairs_path) + +# changing sign for interpretation + +results$smd <- (-1)*results$smd + +# changing order of variables + +variables <- c('cpc10_d','cpc5_d','cpc0_d', + 'cpc10_u','cpc5_u','cpc0_u', + 'access','slope','elevation') + +results$variable <- factor(results$variable, levels=variables) + +# plotting + + ggplot(results,aes(x=smd,y=variable))+ + #geom_boxplot(outlier.shape=NA,colour='blue')+ + geom_point(colour='blue',fill='blue',alpha=0.3,size=4)+ + geom_vline(xintercept=0)+ + geom_vline(xintercept=0.25,lty=2,colour='grey30')+ + geom_vline(xintercept=-0.25,lty=2,colour='grey30')+ + scale_y_discrete(labels=c(bquote(Deforestation~t[-10]~("%")), + bquote(Deforestation~t[-5]~("%")), + bquote(Deforestation~t[0]~("%")), + 
bquote(Forest~cover~t[-10]~("%")), + bquote(Forest~cover~t[-5]~("%")), + bquote(Forest~cover~t[0]~("%")), + 'Inaccessibility (mins)',paste0('Slope (',intToUtf8(176),')'),'Elevation (m)'))+ + xlab('Standardised mean difference')+ + xlim(-1,1)+ + theme_classic()+ + theme(axis.title.y=element_blank(), + legend.title=element_blank(), + legend.box.background=element_rect(), + legend.position='none', + text=element_text(size=14), + axis.text.y=element_text(size=14)) + + +``` + +\ + +### Deforestation within the project + +Now focusing on deforestation within the project, we can examine the spatial distribution of the following 4 processes pertinent to forest carbon stock changes: + +- Undisturbed forest to degraded forest + +- Degraded forest to deforested land + +- Undisturbed forest to deforested land + +- Undisturbed land to reforested land (which indicates that regrowth occurred after a deforestation event) + +\ + +These transitions are shown in the plot below for the 10-year period between `r start_year-10` and `r start_year`. They are overlaid on the project area which is shown in grey. If a transition is not shown, it did not occur in the period examined. + +Data from [JRC tropical moist forest dataset](https://forobs.jrc.ec.europa.eu/TMF). + +\ + +```{r deforestation_spatial_distribution, echo=FALSE, warning=FALSE} + +# plot deforestation within project + +source(file.path(script_path,'plot_transitions.R')) + +proj_coords <- data %>% + filter(type=='Project') %>% + select(lat,lng) + +proj_input_defplot <- data %>% + filter(type=='Project') %>% + select(contains('luc')) %>% + setNames(paste0("luc_", (start_year-10):start_year)) %>% + cbind(proj_coords) + +proj_input_defplot <- proj_input_defplot[, !is.na(colnames(proj_input_defplot))] + +plot_transitions(data=proj_input_defplot,t0=start_year-10,period_length=10,shapefile=shapefile) + +``` + +\ + +### Land cover changes within project and matched pixels + +In the below plots, we show the changes in land classes over time for both the project (red) and the matched points (blue). + +Note the following: + +- The vertical grey dashed line represents the start year of the project (`r start_year`). The timings shown on the x-axis are relative to this start year. + +- As explained in the 'Methods' section, the matched points are offset in time relative to the project by 10 years. This means that all changes observed in the matched points happened 10 years prior to the equivalent time point in the project. This time offset allows us to use the last 10 years in the matched points as a prediction of the next 10 years for the project. + +- Solid lines represent ex-post observed changes, whereas the dotted line represents the prediction for the future of the project. 
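+The 10-year offset described in the points above is the step that turns the matched-points series into a forecast: the final decade observed in the matched points is shifted forward by 10 years and relabelled as the project's predicted trajectory. A minimal sketch of this step, mirroring the shift applied in the plotting code below (using the `cf_results` and `start_year` objects defined in this report), is:
+
+```{r, eval=FALSE, echo=TRUE}
+# Re-label the last 10 years of the matched-points series as the project's
+# predicted next 10 years by shifting the year index forward by a decade.
+prediction <- cf_results %>%
+  filter(year >= (start_year - 10)) %>%  # baseline period of the matched points
+  mutate(type = 'Project',               # treat it as the project's future
+         year = year + 10)               # apply the 10-year offset
+```
+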
+ +```{r make_inputs, echo=FALSE} + +# preparing inputs + +proj_input <- data %>% + filter(type=='Project') %>% + select(contains('luc')) %>% + setNames(paste0("luc_", (start_year-10):start_year)) +proj_input <- proj_input[, !is.na(colnames(proj_input))] + + +cf_input <- data %>% + filter(type=='Counterfactual') %>% + select(contains('luc')) %>% + setNames(paste0("luc_", (start_year-20):(start_year))) %>% + select(where(~ all(!is.na(.)))) + +``` + +```{r luc_timeseries_all, echo=FALSE} + +source(file.path(script_path,'land_cover_timeseries.R')) + +# getting results + +proj_results <- get_luc_timeseries(proj_input,t0=start_year-10,tend=start_year,type='single') %>% + mutate(type='Project') + +cf_results <- get_luc_timeseries(cf_input,t0=start_year-20,tend=start_year,type='single') %>% + mutate(type='Counterfactual') + +# combining results + +results <- bind_rows(proj_results, cf_results) + +``` + +Showing the trend for undisturbed, degraded, deforested and regrowth in turn: + +```{r undisturbed_timeseries, fig.width=8,fig.height=13} + +# add prediction from the matched pixels: + +prediction <- cf_results %>% + filter(year >= (start_year-10)) %>% + mutate(type='Project', + year=year+10) + +results <- bind_rows(results,prediction) + +# make a custom function for plotting the results + +plot_timeseries <- function(luc_value, title_str) { + + #remove gap between solid and dotted project line + percent_val <- results %>% + filter(year == start_year + & type == "Project" + & luc == luc_value) %>% + pull(percentage) + + # df wrangling + extended_results <- results %>% + mutate( + luc = as.numeric(luc), + year = as.numeric(year), + line_type = ifelse(type == "Project" & year > start_year, "dotted", "solid"), + type = case_when( + type == "Counterfactual" ~ "Matched points", + TRUE ~ type + ) + ) %>% + bind_rows(data.frame( + year = start_year, + luc = luc_value, + percentage = percent_val, + type = 'Project', + line_type = 'dotted' + )) + + extended_results %>% + filter(luc == luc_value) %>% + ggplot(aes(x = year, y = percentage, color = type, linetype = line_type)) + + geom_line(linewidth = 1.5) + + geom_vline(xintercept = start_year, linetype = 2, color = 'grey30') + + #geom_vline(xintercept = start_year-10, linetype = 2, color = 'grey30') + + scale_colour_manual(name = 'Location', + values = c('red','blue'), + breaks = c('Project', 'Matched points'), + labels = c('Project', 'Matched points'))+ + xlab('Year') + + ylab('% cover') + + ggtitle(title_str) + + guides(linetype = "none") + + theme_classic() + + scale_linetype_manual(values = c("solid" = "solid", "dotted" = "dotted"))+ + facet_wrap(~type)+ + xlim(start_year-20,start_year+10) + +} + +plot_1 <- plot_timeseries(luc_value=1, title_str='Undisturbed forest') + theme(legend.position='none',axis.title.x = element_blank()) +plot_2 <- plot_timeseries(luc_value=2, title_str='Degraded forest') + theme(legend.position='none', axis.title.x = element_blank()) +plot_3 <- plot_timeseries(luc_value=3, title_str='Deforested land') + theme(legend.position='none', axis.title.x = element_blank()) +plot_4 <- plot_timeseries(luc_value=4, title_str='Regrowth') + theme(legend.position='none', axis.title.x = element_text(size=14)) + +plot_1 + plot_2 + plot_3 + plot_4 + plot_layout(ncol=1) + +``` + +### Deforestation rates in the matched points during the baseline period + +```{r proportions_undisturbed_degraded, echo=FALSE} + +# obtaining the area of undisturbed and degraded forest at t0, for use later + +source(file.path(script_path,'def_rate.R')) + +prop_und <- 
get_prop_class(data=proj_input,t0=start_year-10,class=1) +prop_deg <- get_prop_class(data=proj_input,t0=start_year-10,class=2) + +``` + +Here we present the deforestation rates observed in the matched pixels over the past 10 years (the baseline period). + +Forest loss transitions can be broken down into the following processes: + +- degradation of undisturbed forest + +- deforestation of undisturbed forest + +- deforestation of degraded forest + +- regrowth of undisturbed forest (implies previous deforestation) + +We calculate the rate at which these processes occur in the matched pixels using the following method: + +1. Calculate the percentage of matched pixels which have undergone one of the above processes (according to the JRC classification) during the baseline period. +2. Divide this percentage by 10 to give an annual rate of change, in %/year, relative to the amount of (undisturbed or degraded) forest present at the beginning of the project. +3. Multiply this rate by the area of the (undisturbed or degraded) forest 10 years prior to the project start to give an annual rate, in hectares per year. + +The amounts of forest in the project area 10 years prior to project start are as follows: + +- Undisturbed forest: `r format(100*prop_und, big.mark = ",", scientific = FALSE, digits = 3)`% + +- Degraded forest: `r format(100*prop_deg, big.mark = ",", scientific = FALSE, digits = 3)`% + +The rates are given below. + +```{r rate_of_forest_loss_ha, echo=FALSE} + +source(file.path(script_path,'def_rate.R')) + +df_rate_percent <- def_rate_single(data=cf_input,t0=start_year-10,period_length=10) + +df_rate_ha <- df_rate_percent + +df_rate_ha[df_rate_ha$`Forest type`=='Undisturbed forest',3] <- (df_rate_ha[df_rate_ha$`Forest type`=='Undisturbed forest',3]/100)*project_area_ha*prop_und + +df_rate_ha[df_rate_ha$`Forest type`=='Disturbed forest',3] <- (df_rate_ha[df_rate_ha$`Forest type`=='Disturbed forest',3]/100)*project_area_ha*prop_deg + +knitr::kable( + df_rate_ha %>% + rename('Rate (ha/year)' = 3) %>% + mutate(across(where(is.numeric), ~ sprintf("%.2f", .))) +) + + +``` + +\ + +### Carbon stock changes in the matched points during the baseline period + +Here we present the carbon density calculations for this project. + +In order to convert land cover changes to carbon emissions, we use regional aboveground (AGB) carbon density values generated through NASA GEDI data. + +More information on GEDI data is available [here](https://www.earthdata.nasa.gov/sensors/gedi). + +Note that, in calculating carbon stock changes, we assume the following: + +- Belowground biomass (BGB) is 20% of AGB (based on Cairns et al. 
1997) + +- Deadwood biomass is 11% of AGB (based on IPCC 2003) + +- Emitted carbon dioxide is 47% of total biomass (based on Martin and Thomas 2011) + + +\ +```{r additionality_forecast} + +baseline_stocks <- data.frame(matrix(nrow=2*nrow(carbon_density),ncol=9)) +colnames(baseline_stocks) <- c('time','luc','agb','bgb','dwb','total','total_c','area','total_byarea') +luc_counter <- 1 +row_counter <- 1 + +carbon_density <- filter(carbon_density, land.use.class %in% c(1:6)) + +for(i in carbon_density$land.use.class){ + + for(j in c("Start","End")) { + + # get agb + + agb <- carbon_density$carbon.density[luc_counter] + + # get other values + + bgb <- agb*0.2 + dw <- agb*0.11 + total <- agb + bgb + dw + total_c <- total*0.47 + + # get area of class i + + if (j == "Start") { + area_of_forest <- get_prop_class(cf_input,t0=start_year-10,class=i)*project_area_ha + } else if (j == "End") { + area_of_forest <- get_prop_class(cf_input,t0=start_year,class=i)*project_area_ha } + + # multiply total by area + + total_byarea <- total_c*area_of_forest + + # adding to df + + baseline_stocks[row_counter,1] <- j + baseline_stocks[row_counter,2] <- i + baseline_stocks[row_counter,3] <- agb + baseline_stocks[row_counter,4] <- bgb + baseline_stocks[row_counter,5] <- dw + baseline_stocks[row_counter,6] <- total + baseline_stocks[row_counter,7] <- total_c + baseline_stocks[row_counter,8] <- area_of_forest + baseline_stocks[row_counter,9] <- total_byarea + + row_counter <- row_counter+1 + + } + + # advance counter + + luc_counter <- luc_counter + 1 + +} + +# formatting bits + +baseline_stocks_format <- baseline_stocks +baseline_stocks_format <- baseline_stocks_format %>% filter(time == 'Start') +baseline_stocks_format <- baseline_stocks_format[2:6] + +colnames(baseline_stocks_format) <- c( + 'Land use class', + 'AGB (t / ha)', + 'BGB (t / ha)', + 'Deadwood biomass (t / ha)', + 'Total biomass (t / ha)', + 'Total carbon (t / ha)') + + +# renaming classes + +baseline_stocks_format <- baseline_stocks_format %>% + mutate(`Land use class` = case_when( + `Land use class` == "1" ~ 'Undisturbed', + `Land use class` == "2" ~ 'Degraded', + `Land use class` == "3" ~ 'Deforested', + `Land use class` == "4" ~ 'Reforested', + `Land use class` == "5" ~ 'Water', + `Land use class` == "6" ~ 'Other', + TRUE ~ as.character(`Land use class`) # ensure no unexpected values + )) + + +baseline_stocks_format[2:6] <- lapply(baseline_stocks_format[, 2:6], function(x) { + if (is.numeric(x)) comma(x) else x +}) + +# Print only carbon calculations at this stage + +baseline_stocks_format %>% + drop_na() %>% + mutate(across(where(is.numeric), ~ sprintf("%.2f", .))) %>% + kable(escape = FALSE, format = ifelse(knitr::is_latex_output(), "latex", "html")) %>% + kable_styling(bootstrap_options = "striped") + +``` + +# Results: baseline rate of carbon emissions + +In this section we present the annual rate of carbon loss due to deforestation in the matched points during the baseline period. We can take this to be a prediction of the counterfactual scenario for the project (the *baseline*). 
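+The carbon accounting in this report reduces to a simple per-hectare conversion from aboveground biomass to carbon dioxide, combining the biomass ratios listed above with the 44/12 carbon-to-carbon-dioxide conversion applied in the next chunk. A compact restatement of that arithmetic is sketched below; the function name and inputs are illustrative, and the report itself applies the carbon dioxide conversion to the stock change, which is equivalent.
+
+```{r, eval=FALSE, echo=TRUE}
+# Illustrative per-hectare conversion: AGB -> total biomass -> carbon -> CO2.
+# `agb_t_per_ha` and `area_ha` are placeholder inputs.
+co2_stock_tonnes <- function(agb_t_per_ha, area_ha) {
+  bgb      <- 0.20 * agb_t_per_ha                      # belowground biomass
+  deadwood <- 0.11 * agb_t_per_ha                      # deadwood biomass
+  total_c  <- 0.47 * (agb_t_per_ha + bgb + deadwood)   # carbon fraction of total biomass
+  (44 / 12) * total_c * area_ha                        # carbon to CO2, scaled by area
+}
+```
+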
+
+First we present the carbon stock changes observed in the matched points during the baseline period:
+
+```{r results}
+
+baseline_stock_changes <- baseline_stocks[c(1:2,8:9)]
+
+# reshape
+
+reshaped_data <- baseline_stock_changes %>%
+  mutate(luc = as.character(luc)) %>%
+  group_by(luc) %>%
+  summarize(
+    area_start = area[time == "Start"],
+    area_end = area[time == "End"],
+    area_diff = area_start - area_end,
+    c_start = total_byarea[time == "Start"],
+    c_end = total_byarea[time == "End"],
+    c_diff = c_start - c_end,
+    .groups = 'drop'
+  )
+
+# get totals
+
+total_row <- reshaped_data %>%
+  summarize(
+    luc = "Total",
+    area_start = sum(area_start, na.rm = TRUE),
+    area_end = sum(area_end, na.rm = TRUE),
+    area_diff = sum(area_diff, na.rm = TRUE),
+    c_start = sum(c_start, na.rm = TRUE),
+    c_end = sum(c_end, na.rm = TRUE),
+    c_diff = sum(c_diff, na.rm = TRUE)
+  ) %>%
+  mutate(luc = as.character(luc))
+
+baseline_stock_changes <- bind_rows(reshaped_data, total_row)
+
+# add in conversion to CO2
+
+baseline_stock_changes <- baseline_stock_changes %>%
+  mutate(co2_diff = (44/12)*c_diff)
+
+# formatting bits
+
+baseline_stock_changes_format <- baseline_stock_changes %>%
+  mutate(across(where(is.numeric), ~ comma(.))) %>%
+  mutate(across(where(is.numeric), ~ sprintf("%.2f", as.numeric(.))))
+
+if (knitr::is_html_output()) {
+  colnames(baseline_stock_changes_format) <- c(
+    'Land use class',
+    'Area at start (ha)',
+    'Area at end (ha)',
+    'Area loss (ha)',
+    'Biomass at start (t)',
+    'Biomass at end (t)',
+    'Biomass loss (t)',
+    'CO2 loss (t)')
+} else if (knitr::is_latex_output()) {
+  colnames(baseline_stock_changes_format) <- c(
+    'Land use class',
+    'Area at start (ha)',
+    'Area at end (ha)',
+    'Area loss (ha)',
+    'Biomass at start (t)',
+    'Biomass at end (t)',
+    'Biomass loss (t)',
+    'CO$_{2}$ loss (t)')
+}
+
+baseline_stock_changes_format <- baseline_stock_changes_format %>%
+  mutate(`Land use class` = case_when(
+    `Land use class` == "1" ~ 'Undisturbed',
+    `Land use class` == "2" ~ 'Degraded',
+    `Land use class` == "3" ~ 'Deforested',
+    `Land use class` == "4" ~ 'Reforested',
+    `Land use class` == "5" ~ 'Water',
+    `Land use class` == "6" ~ 'Other',
+    TRUE ~ as.character(`Land use class`) # ensure no unexpected values
+  ))
+
+baseline_stock_changes_format[nrow(baseline_stock_changes_format), 1] <- 'Total'
+
+filtered_data <- baseline_stock_changes_format %>%
+  drop_na() %>%
+  mutate(across(where(is.numeric), ~ sprintf("%.2f", .)))
+
+last_row_index <- nrow(filtered_data)
+
+filtered_data %>%
+  kable(escape = FALSE, format = ifelse(knitr::is_latex_output(), "latex", "html")) %>%
+  kable_styling(bootstrap_options = "striped") %>%
+  row_spec(last_row_index, bold = TRUE)
+
+```
+
+
+```{r results_summary}
+
+# find the difference
+
+delta_c <- as.numeric(baseline_stock_changes[nrow(baseline_stock_changes), ncol(baseline_stock_changes)])
+delta_c_annual <- delta_c/10
+
+```
+
+To calculate the baseline annual rate of carbon emissions, we sum the differences in carbon stocks between the start and end of the baseline period, convert this total to carbon dioxide, then divide by the length of the baseline period (10 years).
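+
+As a toy illustration with hypothetical numbers (not this project's values): if the matched points lost 150,000 tonnes of carbon dioxide over the 10-year baseline period, the baseline annual rate would be 150,000 / 10 = 15,000 tonnes of carbon dioxide per year.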
+
+**For this project, the baseline annual rate of carbon emissions, in tonnes of carbon dioxide per year, is `r format(delta_c_annual, big.mark = ",", scientific = FALSE, digits = 3)`.** This should be interpreted as the maximum estimated number of carbon credits that could be generated if the project mitigated 100% of the baseline level of deforestation, assuming this is confirmed by ex post observations. We present alternative mitigation scenarios below.
+
+### Expected additionality under different mitigation scenarios
+
+Additionality depends not only on the baseline deforestation rate but also on the ability of the project to mitigate that deforestation. Therefore, we present the expected additionality under different mitigation scenarios (100% to 10% mitigation success). The 100% mitigation scenario occurs where the project is able to mitigate all of the deforestation that would have occurred under the counterfactual scenario. This scenario is unlikely to be realistic, but gives a sense of the deforestation risk faced by each area. It is more likely that only a proportion of this baseline deforestation is mitigated - for example, under the 50% mitigation scenario, the project is able to reduce the deforestation rate by half. The higher the proportion of the baseline deforestation that is mitigated, the greater the additionality of a project.
+
+Note that we present the raw additionality, without accounting for leakage and impermanence (discussed later).
+
+We are in the process of producing confidence intervals that reflect the uncertainty associated with the baseline, which will be added to future revisions of this document.
+
+```{r}
+
+scenarios <- data.frame(matrix(ncol=2,nrow=5))
+scenarios[1] <- c("10%","25%","50%","75%","100%")
+scenarios[2] <- delta_c_annual*c(0.1,0.25,0.5,0.75,1)
+
+if (knitr::is_html_output()) {
+  colnames(scenarios) <- c('Scenario',
+                           'Additionality (t CO2 / year)')
+} else if (knitr::is_latex_output()) {
+  colnames(scenarios) <- c('Scenario',
+                           'Additionality (t CO$_{2}$ / year)')
+}
+
+scenarios <- scenarios %>%
+  mutate(across(where(is.numeric), comma))
+
+knitr::kable(
+  scenarios
+)
+
+```
+
+\
+
+# Accounting for leakage and impermanence
+
+Leakage and permanence are two factors that affect the long-term emissions reductions contributed by a project but **have not been included in this evaluation**.
+
+**Leakage** is the displacement of activities which deplete forest carbon stocks from the project to other areas due to the implementation of the project. In the worst case scenario, 100% of these activities are displaced, effectively nullifying the additionality of the project. Leakage is likely to be lower if the processes leading to deforestation and degradation do not result in high-yielding land uses, or if the carbon densities within the project are high compared with those in other areas where these activities are taking place. Leakage can also be reduced by interventions which improve yields in areas already under production. We can provide guidance on how this could be achieved.
+
+**Impermanence** occurs when the additionality generated by a project is reversed. Additional carbon stocks in forests are inherently vulnerable to these reversals. The estimates given in this evaluation assume that all carbon stored is permanent, but in reality this is unlikely to be the case.
In future revisions of this document we aim to include indicative estimates of the equivalent permanence (the relative value of a impermanent credit relative to a permanent credit) for this project. + +You can find out more about our plans to deal with leakage and permanence in our [explainer page](https://4c.cst.cam.ac.uk/about/additionality-leakage-and-permanence). + +--- + +### Reproducibility + +This report was generated on `r format(Sys.Date(), "%B %d, %Y")` using the `r branch` of the [tmf-implementation code](https://github.com/quantifyearth/tmf-implementation). diff --git a/evaluations/methods_diagram.png b/evaluations/methods_diagram.png new file mode 100644 index 0000000..0e544b3 Binary files /dev/null and b/evaluations/methods_diagram.png differ diff --git a/evaluations/scripts/def_rate.R b/evaluations/scripts/def_rate.R new file mode 100644 index 0000000..6a4c417 --- /dev/null +++ b/evaluations/scripts/def_rate.R @@ -0,0 +1,328 @@ + + + +def_rate <- function(data,t0,period_length,process='all'){ + + # get name of column for start year + + t0_index <- grep(paste0('luc_',t0),colnames(data)) + + # filter down to pixels with undisturbed forest (JRC class 1) + + data_filtered <- data[data[,t0_index]==1,] + + # count 1s at t0 in project and match + + proj_1s <- data_filtered %>% filter(type=='Project') %>% nrow() + cf_1s <- data_filtered %>% filter(type=='Counterfactual') %>% nrow() + + # identify where there have been changes during the evaluation period + + tend <- t0 + period_length + + luc_tend <- data_filtered %>% + select(paste0('luc_',tend)) + + # choosing processes to measure + + if(process=='def_only'){ + + response <- case_when( + luc_tend==1 ~ 0, + luc_tend==2 ~ 0, + luc_tend==3 ~ 1, + luc_tend==4 ~ 0, + luc_tend>4 ~ 0) + + } else if(process=='deg_only'){ + + response <- case_when( + luc_tend==1 ~ 0, + luc_tend==2 ~ 1, + luc_tend==3 ~ 0, + luc_tend==4 ~ 0, + luc_tend>4 ~ 0) + + } else { + + response <- case_when( + luc_tend==1 ~ 0, + luc_tend==2 ~ 1, + luc_tend==3 ~ 1, + luc_tend==4 ~ 1, + luc_tend>4 ~ 0) + + } + + + data_filtered$response <- response + + # count up number of pixels where there have been changes for each type + + proj_changes <- data_filtered %>% filter(response==1 & type=='Project') %>% + nrow() + cf_changes <- data_filtered %>% filter(response==1 & type=='Counterfactual') %>% + nrow() + + # calculate deforestation rate (= the rate of loss of undisturbed forest) as a percentage + + proj_rate <- 100*(proj_changes/proj_1s)/period_length + cf_rate <- 100*(cf_changes/cf_1s)/period_length + + # make df + + df <- data.frame(matrix(ncol=2,nrow=1)) + colnames(df) <- c('Project','Counterfactual') + df[1,1] <- proj_rate + df[1,2] <- cf_rate + + return(df) + +} + + + +def_rate_seperate <- function(data,t0,period_length){ + + # get name of column for start year + + t0_index <- grep(paste0('luc_',t0),colnames(data)) + + # filter down to pixels with undisturbed forest (JRC class 1) + + data_filtered <- data[data[,t0_index]==1,] + + # count 1s at t0 in project and cf + + proj_1s <- data_filtered %>% filter(type=='Project') %>% nrow() + cf_1s <- data_filtered %>% filter(type=='Counterfactual') %>% nrow() + + # identify where there have been changes during the evaluation period + + tend <- t0 + period_length + + luc_tend <- data_filtered %>% + select(paste0('luc_',tend)) + + # measuring responses + + def_response <- case_when( + luc_tend==1 ~ 0, + luc_tend==2 ~ 0, + luc_tend==3 ~ 1, + luc_tend==4 ~ 0, + luc_tend>4 ~ 0) + + deg_response <- case_when( + luc_tend==1 ~ 0, + 
luc_tend==2 ~ 1, + luc_tend==3 ~ 0, + luc_tend==4 ~ 0, + luc_tend>4 ~ 0) + + ref_response <- case_when( + luc_tend==1 ~ 0, + luc_tend==2 ~ 0, + luc_tend==3 ~ 0, + luc_tend==4 ~ 1, + luc_tend>4 ~ 0) + + data_filtered$def_response <- def_response + data_filtered$deg_response <- deg_response + data_filtered$ref_response <- ref_response + + # count up number of pixels where there have been changes for each type + + proj_def_changes <- data_filtered %>% filter(def_response==1 & type=='Project') %>% + nrow() + cf_def_changes <- data_filtered %>% filter(def_response==1 & type=='Counterfactual') %>% + nrow() + + proj_deg_changes <- data_filtered %>% filter(deg_response==1 & type=='Project') %>% + nrow() + cf_deg_changes <- data_filtered %>% filter(deg_response==1 & type=='Counterfactual') %>% + nrow() + + proj_ref_changes <- data_filtered %>% filter(ref_response==1 & type=='Project') %>% + nrow() + cf_ref_changes <- data_filtered %>% filter(ref_response==1 & type=='Counterfactual') %>% + nrow() + + # calculate deforestation rate (= the rate of loss of undisturbed forest) as a percentage + + proj_def <- 100*(proj_def_changes/proj_1s)/period_length + cf_def <- 100*(cf_def_changes/cf_1s)/period_length + + proj_deg <- 100*(proj_deg_changes/proj_1s)/period_length + cf_deg <- 100*(cf_deg_changes/cf_1s)/period_length + + proj_ref <- 100*(proj_ref_changes/proj_1s)/period_length + cf_ref <- 100*(cf_ref_changes/cf_1s)/period_length + + # adding the degraded-to-deforested transition + + data_filtered_2 <- data[data[,t0_index]==2,] + + # count 2s at t0 in project and cf + + proj_2s <- data_filtered_2 %>% filter(type=='Project') %>% nrow() + cf_2s <- data_filtered_2 %>% filter(type=='Counterfactual') %>% nrow() + + # identify where there have been changes during the evaluation period + + luc_tend_2 <- data_filtered_2 %>% + select(paste0('luc_',tend)) + + def_response_2 <- case_when( + luc_tend_2==1 ~ 0, + luc_tend_2==2 ~ 0, + luc_tend_2==3 ~ 1, + luc_tend_2==4 ~ 0, + luc_tend_2>4 ~ 0) + + data_filtered_2$def_response_2 <- def_response_2 + + proj_def_changes_2 <- data_filtered_2 %>% filter(def_response_2==1 & type=='Project') %>% + nrow() + cf_def_changes_2 <- data_filtered_2 %>% filter(def_response_2==1 & type=='Counterfactual') %>% + nrow() + + proj_deg_to_def <- 100*(proj_def_changes_2/proj_2s)/period_length + cf_deg_to_def <- 100*(cf_def_changes_2/cf_2s)/period_length + + # make df + + df <- data.frame(matrix(ncol=4,nrow=8)) + + colnames(df) <- c('Process','Forest type','Location','Rate (%/year)') + + df[1] <- c(rep(c('Degradation','Deforestation','Deforestation','Reforestation'),each=2)) + df[2] <- c(rep(c('Undisturbed forest','Undisturbed forest','Disturbed forest','Undisturbed forest'),each=2)) + df[3] <- c(rep(c('Project','Counterfactual'),times=4)) + df[4] <- c(proj_deg,cf_deg,proj_def,cf_def,proj_deg_to_def,cf_deg_to_def,proj_ref,cf_ref) + + return(df) + +} + +get_prop_class <- function(data,t0,class){ + + t0_index <- grep(paste0('luc_',t0),colnames(data)) + data_filtered <- data[data[,t0_index]==class,] + + total_count <- data %>% nrow() + class_count <- data_filtered %>% nrow() + prop <- class_count/total_count + + return(prop) + +} + + +def_rate_single <- function(data,t0,period_length){ + + # get name of column for start year + + t0_index <- grep(paste0('luc_',t0),colnames(data)) + + # filter down to pixels with undisturbed forest (JRC class 1) + + data_filtered <- data[data[,t0_index]==1,] + + # count 1s at t0 in project and cf + + no_1s <- nrow(data_filtered) + + # identify where there have been 
changes during the evaluation period + + tend <- t0 + period_length + + luc_tend <- data_filtered %>% + select(paste0('luc_',tend)) + + # measuring responses + + def_response <- case_when( + luc_tend==1 ~ 0, + luc_tend==2 ~ 0, + luc_tend==3 ~ 1, + luc_tend==4 ~ 0, + luc_tend>4 ~ 0) + + deg_response <- case_when( + luc_tend==1 ~ 0, + luc_tend==2 ~ 1, + luc_tend==3 ~ 0, + luc_tend==4 ~ 0, + luc_tend>4 ~ 0) + + ref_response <- case_when( + luc_tend==1 ~ 0, + luc_tend==2 ~ 0, + luc_tend==3 ~ 0, + luc_tend==4 ~ 1, + luc_tend>4 ~ 0) + + data_filtered$def_response <- def_response + data_filtered$deg_response <- deg_response + data_filtered$ref_response <- ref_response + + # count up number of pixels where there have been changes for each type + + def_changes <- data_filtered %>% filter(def_response==1) %>% + nrow() + + deg_changes <- data_filtered %>% filter(deg_response==1) %>% + nrow() + + ref_changes <- data_filtered %>% filter(ref_response==1) %>% + nrow() + + # calculate deforestation rate (= the rate of loss of undisturbed forest) as a percentage + + def <- 100*(def_changes/no_1s)/period_length + + deg <- 100*(deg_changes/no_1s)/period_length + + ref <- 100*(ref_changes/no_1s)/period_length + + # adding the degraded-to-deforested transition + + data_filtered_2 <- data[data[,t0_index]==2,] + + # count 2s at t0 in project and cf + + no_2s <- data_filtered_2 %>% nrow() + + # identify where there have been changes during the evaluation period + + luc_tend_2 <- data_filtered_2 %>% + select(paste0('luc_',tend)) + + def_response_2 <- case_when( + luc_tend_2==1 ~ 0, + luc_tend_2==2 ~ 0, + luc_tend_2==3 ~ 1, + luc_tend_2==4 ~ 0, + luc_tend_2>4 ~ 0) + + data_filtered_2$def_response_2 <- def_response_2 + + def_changes_2 <- data_filtered_2 %>% filter(def_response_2==1) %>% + nrow() + + deg_to_def <- 100*(def_changes_2/no_2s)/period_length + + # make df + + df <- data.frame(matrix(ncol=3,nrow=4)) + + colnames(df) <- c('Process','Forest type','Rate (%/year)') + + df[1] <- c('Degradation','Deforestation','Deforestation','Reforestation') + df[2] <- c('Undisturbed forest','Undisturbed forest','Disturbed forest','Undisturbed forest') + df[3] <- c(deg,def,deg_to_def,ref) + + return(df) + +} \ No newline at end of file diff --git a/evaluations/scripts/land_cover_timeseries.R b/evaluations/scripts/land_cover_timeseries.R new file mode 100644 index 0000000..6490bf1 --- /dev/null +++ b/evaluations/scripts/land_cover_timeseries.R @@ -0,0 +1,111 @@ + +get_luc_timeseries <- function(data,t0,tend,type='both'){ + + years_list <- seq(t0,tend) + + if(type=='both'){ + + df <- data.frame(matrix(ncol=4,nrow=8*length(years_list))) + + colnames(df) <- c('year','type','luc','percentage') + + counter <- 1 + + for(year in years_list) { + + for(i in seq (1:4)) { + + for(type_value in c('Project','Counterfactual')) { + + total <- data %>% filter(type == type_value) %>% nrow() + + no_class_i <- data %>% filter(type == type_value & .data[[paste0('luc_',year)]]==i) %>% nrow() + + prop <- no_class_i/total + + df[counter,1] <- year + df[counter,2] <- type_value + df[counter,3] <- i + df[counter,4] <- prop*100 + + counter <- counter+1 + + } + + } + + } + + } else if(type=='single'){ + + df <- data.frame(matrix(ncol=3,nrow=4*length(years_list))) + + colnames(df) <- c('year','luc','percentage') + + counter <- 1 + + for(year in years_list) { + + for(i in seq (1:4)) { + + total <- data %>% nrow() + + no_class_i <- data %>% filter(.data[[paste0('luc_',year)]]==i) %>% nrow() + + prop <- no_class_i/total + + df[counter,1] <- year + 
df[counter,2] <- i + df[counter,3] <- prop*100 + + counter <- counter+1 + + } + + } + + } + + return(drop_na(df)) + +} + +luc_class1_uncertainty <- function(data,t0,tend) { + + years_list <- seq(t0-10,tend) + + df <- data.frame(matrix(ncol=4,nrow=2*length(unique(data$pair))*length(years_list))) + + colnames(df) <- c('year','type','pair','percent_class1') + + counter <- 1 + + for(year in years_list) { + + for(type_value in c('Project','Counterfactual')) { + + for(pair_id in unique(data$pair)) { + + total <- data %>% filter(type == type_value & pair == pair_id) %>% nrow() + + no_class_i <- data %>% filter(type == type_value & pair == pair_id & .data[[paste0('luc_',year)]]==1) %>% nrow() + + prop <- no_class_i/total + + df[counter,1] <- year + df[counter,2] <- type_value + df[counter,3] <- pair_id + df[counter,4] <- prop*100 + + counter <- counter+1 + + } + + } + + } + + return(drop_na(df)) + +} + diff --git a/evaluations/scripts/plot_matchingvars.R b/evaluations/scripts/plot_matchingvars.R new file mode 100644 index 0000000..ec47f01 --- /dev/null +++ b/evaluations/scripts/plot_matchingvars.R @@ -0,0 +1,42 @@ +plot_matching_variables <- function(data, ex_ante = 'false') { + + cont_data <- data %>% dplyr::select(type, elevation, slope, access, starts_with('cpc')) + cont_data[, 5:length(cont_data)] <- 100 * cont_data[, 5:length(cont_data)] # cpcs as percentages + cont_data <- melt(cont_data) + + # rename labels + cont_data$variable <- factor(cont_data$variable, + levels = c('access', 'cpc0_u', 'cpc0_d', + 'slope', 'cpc5_u', 'cpc5_d', + 'elevation', 'cpc10_u', 'cpc10_d')) + + levels(cont_data$variable) <- c('Inaccessibility', + 'Forest~cover~t[0]', + 'Deforestation~t[0]', + 'Slope', + 'Forest~cover~t[-5]', + 'Deforestation~t[-5]', + 'Elevation', + 'Forest~cover~t[-10]', + 'Deforestation~t[-10]') + + # determine labels based on ex_ante + if (ex_ante == 'false') { + plot_labels <- c('Counterfactual', 'Project') + } else if (ex_ante == 'true') { + plot_labels <- c('Matched points', 'Project')} + + # plot + matchingvars <- ggplot(data = cont_data, mapping = aes(x = value, colour = type)) + + geom_density(adjust = 10, size = 1) + + facet_wrap(~variable, scales = 'free', nrow = 3, labeller = label_parsed) + + ylab('Density') + + scale_colour_manual(values = c('blue', 'red'), labels = plot_labels) + + theme_classic() + + theme(legend.title = element_blank(), + axis.title.x = element_blank(), + axis.text.y = element_blank(), + axis.ticks.y = element_blank()) + + return(matchingvars) +} \ No newline at end of file diff --git a/evaluations/scripts/plot_transitions.R b/evaluations/scripts/plot_transitions.R new file mode 100644 index 0000000..2931a60 --- /dev/null +++ b/evaluations/scripts/plot_transitions.R @@ -0,0 +1,63 @@ +library(ggspatial) + +plot_transitions <- function(data,t0,period_length,shapefile){ + + # count number of 1s at project start + + t0_index <- grep(paste0('luc_',t0),colnames(data)) + + data_filtered <- data[data[,t0_index]==1,] + + # identify where there have been changes + + tend <- t0 + period_length + + luc_tend <- data_filtered[[paste0('luc_', tend)]] + + response <- case_when( + luc_tend==1 ~ NA, + luc_tend==2 ~ 'deg', + luc_tend==3 ~ 'def', + luc_tend==4 ~ 'ref', + luc_tend>4 ~ NA) + + data_filtered$response <- as.factor(response) + data_filtered <- data_filtered %>% filter(!is.na(response)) + + # adding deg --> def transition + + # count number of 2s at project start + + data_filtered_2s <- data[data[,t0_index]==2,] + + # identify where there have been changes + + luc_tend <- 
data_filtered_2s[[paste0('luc_', tend)]] + + response <- case_when( + luc_tend==1 ~ NA, + luc_tend==2 ~ NA, + luc_tend==3 ~ 'deg_to_def', + luc_tend==4 ~ NA, + luc_tend>4 ~ NA) + + data_filtered_2s$response <- as.factor(response) + data_filtered_2s <- data_filtered_2s %>% filter(!is.na(response)) + + combined_dat <- bind_rows(data_filtered, data_filtered_2s) + combined_dat$response <- factor(combined_dat$response, levels=c('deg','deg_to_def','def','ref')) + + # plotting + + plot <- combined_dat %>% + filter(response != 0) %>% + ggplot(aes(x=lng,y=lat,colour=response))+ + geom_sf(data=shapefile,inherit.aes=F,fill='grey80',colour=NA)+ + geom_point(alpha=0.5,size=0.5)+ + scale_colour_manual(values=c('yellow','orange','red','green'),name='Transition',labels=c('Undisturbed to degraded','Degraded to deforested','Undisturbed to deforested','Undisturbed to reforested'))+ + annotation_scale(text_cex = 1.3)+ + theme_void() + + return(plot) + +} diff --git a/evaluations/scripts/std_mean_diff.R b/evaluations/scripts/std_mean_diff.R new file mode 100644 index 0000000..63d81ba --- /dev/null +++ b/evaluations/scripts/std_mean_diff.R @@ -0,0 +1,57 @@ + +std_mean_diff <- function(path_to_pairs) { + + # clean data + + files_full_raw <- list.files(path_to_pairs, + pattern='*.parquet',full.names=T,recursive=F) + files_full <- files_full_raw[!grepl('matchless',files_full_raw)] + files_short_raw <- list.files(path=path_to_pairs, + pattern='*.parquet',full.names=F,recursive=F) + files_short <- files_short_raw[!grepl('matchless',files_short_raw)] + + # initialise dfs + + vars <- c(colnames(read_parquet(files_full[1])),'pair') + df <- data.frame(matrix(ncol=length(vars),nrow=0)) + colnames(df) <- vars + + for(j in 1:length(files_full)){ + + # read in all parquet files for a given project + + f <- data.frame(read_parquet(files_full[j])) + + # append data to bottom of df + + df <- bind_rows(df,f) + + } + + # calculate smd + + smd_results <- data.frame(variable = character(), smd = numeric(), stringsAsFactors = FALSE) + + variables <- c('cpc10_d','cpc5_d','cpc0_d', + 'cpc10_u','cpc5_u','cpc0_u', + 'access','slope','elevation') + + for (var in variables) { + k_var <- df[[paste0("k_", var)]] + s_var <- df[[paste0("s_", var)]] + + k_mean <- mean(k_var, na.rm = TRUE) + s_mean <- mean(s_var, na.rm = TRUE) + k_sd <- sd(k_var, na.rm = TRUE) + s_sd <- sd(s_var, na.rm = TRUE) + + pooled_sd <- sqrt((k_sd^2 + s_sd^2) / 2) + smd <- (k_mean - s_mean) / pooled_sd + + smd_results <- rbind(smd_results, data.frame(variable = var, smd = smd, stringsAsFactors = FALSE)) + } + + return(smd_results) +} + + \ No newline at end of file diff --git a/methods/inputs/generate_slope.py b/methods/inputs/generate_slope.py index 891e26f..7d58988 100644 --- a/methods/inputs/generate_slope.py +++ b/methods/inputs/generate_slope.py @@ -111,138 +111,123 @@ def generate_slope(input_elevation_directory: str, output_slope_directory: str): continue with tempfile.TemporaryDirectory() as tmpdir: - elevation = RasterLayer.layer_from_file(elev_path) - - logging.info("Area of elevation tile %a", elevation.area) - _easting, _northing, lower_code, lower_letter = utm.from_latlon( - elevation.area.bottom, elevation.area.left - ) - _easting, _northing, upper_code, upper_letter = utm.from_latlon( - elevation.area.top, elevation.area.right - ) - - # FAST PATH -- with only one UTM zone the reprojection back has no issues - if lower_code == upper_code and lower_letter == upper_letter: - actual_utm_code = lower_code - warp( - actual_utm_code, - elev_path, - 
elevation.pixel_scale.xstep, - elevation.pixel_scale.ystep, - out_path, + with RasterLayer.layer_from_file(elev_path) as elevation: + + logging.info("Area of elevation tile %a", elevation.area) + _easting, _northing, lower_code, lower_letter = utm.from_latlon( + elevation.area.bottom, elevation.area.left ) - else: - # SLOW PATH -- in the slow path, we have to break the elevation raster into - # UTM sections and do the above to each before reprojecting back and recombining - - # To capture the results here for later inspection just override the tmpdir variable - for actual_utm_code in range(lower_code, upper_code + 1): - for utm_letter in crange(lower_letter, upper_letter): - logging.debug("UTM(%s,%s)", actual_utm_code, utm_letter) - - # Note: we go a little bit around the UTM tiles and will crop them down to size later - # this is to remove some aliasing effects. - bbox = bounding_box_of_utm(actual_utm_code, utm_letter, UTM_EXPANSION_DEGREES) - - # Crop the elevation tile to a UTM zone - utm_layer = RasterLayer.empty_raster_layer_like( - elevation, area=bbox - ) - utm_id = f"{actual_utm_code}-{utm_letter}-{elevation_path}" - utm_clip_path = os.path.join(tmpdir, utm_id) - intersection = RasterLayer.find_intersection( - [elevation, utm_layer] - ) - result = RasterLayer.empty_raster_layer( - intersection, - elevation.pixel_scale, - elevation.datatype, - utm_clip_path, - elevation.projection, - ) - result.set_window_for_intersection(intersection) - elevation.set_window_for_intersection(intersection) - elevation.save(result) - - # Flush elevation utm clip to disk - del result - - # Now warp into UTM, calculate slopes, and warp back - slope_out_path = os.path.join(tmpdir, "out-slope-" + utm_id) - warp( - actual_utm_code, - utm_clip_path, - elevation.pixel_scale.xstep, - elevation.pixel_scale.ystep, - slope_out_path, - ) - - # We now recrop the out-slope back to the bounding box we assumed at the start - bbox_no_expand = bounding_box_of_utm( - actual_utm_code, utm_letter, 0.0 - ) - slope_tif = RasterLayer.layer_from_file(slope_out_path) - grid = RasterLayer.empty_raster_layer_like( - slope_tif, area=bbox_no_expand - ) - output_final = f"final-slope-{actual_utm_code}-{utm_letter}-{elevation_path}" - final_path = os.path.join(tmpdir, output_final) - logging.debug("Slope underlying %s", slope_tif._underlying_area) # pylint: disable=W0212 - logging.debug("Grid underling %s", grid._underlying_area) # pylint: disable=W0212 - try: - intersection = RasterLayer.find_intersection([slope_tif, grid]) - except ValueError: - logging.debug( - "UTM (%s, %s) didn't intersect actual area %s", - actual_utm_code, - utm_letter, - grid._underlying_area # pylint: disable=W0212 - ) - continue - slope_tif.set_window_for_intersection(intersection) - final = RasterLayer.empty_raster_layer( - intersection, - slope_tif.pixel_scale, - slope_tif.datatype, - final_path, - slope_tif.projection, - ) - logging.debug("Final underlying %s", final._underlying_area) # pylint: disable=W0212 - final.set_window_for_intersection(intersection) - slope_tif.save(final) - - # Flush - del final - - # Now to recombine the UTM gridded slopes into the slope tile - slopes = glob("final-slope-*", root_dir=tmpdir) - assert len(slopes) > 0 - - # This sets the order a little better for the union of the layers - slopes.sort() - slopes.reverse() - - logging.info("Render order %s", slopes) - - combined = GroupLayer( - [ - RasterLayer.layer_from_file(os.path.join(tmpdir, filename)) - for filename in slopes - ] + _easting, _northing, upper_code, 
upper_letter = utm.from_latlon( + elevation.area.top, elevation.area.right ) - elevation = RasterLayer.layer_from_file(elev_path) - intersection = RasterLayer.find_intersection([elevation, combined]) - combined.set_window_for_intersection(intersection) - elevation.set_window_for_intersection(intersection) - - assembled_path = os.path.join(tmpdir, "patched.tif") - result = RasterLayer.empty_raster_layer_like( - elevation, filename=assembled_path - ) - combined.save(result) + # FAST PATH -- with only one UTM zone the reprojection back has no issues + if lower_code == upper_code and lower_letter == upper_letter: + actual_utm_code = lower_code + warp( + actual_utm_code, + elev_path, + elevation.pixel_scale.xstep, + elevation.pixel_scale.ystep, + out_path, + ) + else: + # SLOW PATH -- in the slow path, we have to break the elevation raster into + # UTM sections and do the above to each before reprojecting back and recombining + + # To capture the results here for later inspection just override the tmpdir variable + for actual_utm_code in range(lower_code, upper_code + 1): + for utm_letter in crange(lower_letter, upper_letter): + logging.debug("UTM(%s,%s)", actual_utm_code, utm_letter) + + # Note: we go a little bit around the UTM tiles and will crop them down to size later + # this is to remove some aliasing effects. + bbox = bounding_box_of_utm(actual_utm_code, utm_letter, UTM_EXPANSION_DEGREES) + + # Crop the elevation tile to a UTM zone + with RasterLayer.empty_raster_layer_like(elevation, area=bbox) as utm_layer: + utm_id = f"{actual_utm_code}-{utm_letter}-{elevation_path}" + utm_clip_path = os.path.join(tmpdir, utm_id) + intersection = RasterLayer.find_intersection( + [elevation, utm_layer] + ) + with RasterLayer.empty_raster_layer( + intersection, + elevation.pixel_scale, + elevation.datatype, + utm_clip_path, + elevation.projection, + ) as result: + result.set_window_for_intersection(intersection) + elevation.set_window_for_intersection(intersection) + elevation.save(result) + + # Now warp into UTM, calculate slopes, and warp back + slope_out_path = os.path.join(tmpdir, "out-slope-" + utm_id) + warp( + actual_utm_code, + utm_clip_path, + elevation.pixel_scale.xstep, + elevation.pixel_scale.ystep, + slope_out_path, + ) - shutil.move(assembled_path, out_path) + # We now recrop the out-slope back to the bounding box we assumed at the start + bbox_no_expand = bounding_box_of_utm( + actual_utm_code, utm_letter, 0.0 + ) + with RasterLayer.layer_from_file(slope_out_path) as slope_tif: + with RasterLayer.empty_raster_layer_like(slope_tif, area=bbox_no_expand) as grid: + output_final = f"final-slope-{actual_utm_code}-{utm_letter}-{elevation_path}" + final_path = os.path.join(tmpdir, output_final) + logging.debug("Slope underlying %s", slope_tif._underlying_area) # pylint: disable=W0212 + logging.debug("Grid underling %s", grid._underlying_area) # pylint: disable=W0212 + try: + intersection = RasterLayer.find_intersection([slope_tif, grid]) + except ValueError: + logging.debug( + "UTM (%s, %s) didn't intersect actual area %s", + actual_utm_code, + utm_letter, + grid._underlying_area # pylint: disable=W0212 + ) + continue + slope_tif.set_window_for_intersection(intersection) + with RasterLayer.empty_raster_layer( + intersection, + slope_tif.pixel_scale, + slope_tif.datatype, + final_path, + slope_tif.projection, + ) as final: + logging.debug("Final underlying %s", final._underlying_area) # pylint: disable=W0212 + final.set_window_for_intersection(intersection) + slope_tif.save(final) + + # Now to 
recombine the UTM gridded slopes into the slope tile + slopes = glob("final-slope-*", root_dir=tmpdir) + assert len(slopes) > 0 + + # This sets the order a little better for the union of the layers + slopes.sort() + slopes.reverse() + + logging.info("Render order %s", slopes) + + files = [os.path.join(tmpdir, filename) for filename in slopes] + with GroupLayer.layer_from_files(files) as combined: + with RasterLayer.layer_from_file(elev_path) as elevation: + intersection = RasterLayer.find_intersection([elevation, combined]) + combined.set_window_for_intersection(intersection) + elevation.set_window_for_intersection(intersection) + + assembled_path = os.path.join(tmpdir, "patched.tif") + with RasterLayer.empty_raster_layer_like( + elevation, filename=assembled_path + ) as result: + combined.save(result) + + shutil.move(assembled_path, out_path) def main() -> None: diff --git a/methods/matching/build_m_table.py b/methods/matching/build_m_table.py index 66a27e2..ddf24e6 100644 --- a/methods/matching/build_m_table.py +++ b/methods/matching/build_m_table.py @@ -28,8 +28,8 @@ def build_m_table( matching_collection = build_layer_collection( merged_raster.pixel_scale, merged_raster.projection, - list(luc_range(start_year, evaluation_year)), - [start_year, start_year - 5, start_year - 10], + list(luc_range(start_year-10, start_year)), + [start_year-10, start_year - 15, start_year - 20], matching_zone_filename, jrc_directory_path, cpc_directory_path, @@ -44,7 +44,7 @@ def build_m_table( assert matching_collection.boundary.area == merged_raster.area results = [] - luc_columns = [f'luc_{year}' for year in luc_range(start_year, evaluation_year)] + luc_columns = [f'luc_{year}' for year in luc_range(start_year-10, start_year)] cpc_columns = ['cpc0_u', 'cpc0_d', 'cpc5_u', 'cpc5_d', 'cpc10_u', 'cpc10_d'] columns = ['lat', 'lng', 'ecoregion', 'elevation', 'slope', 'access', 'country'] + luc_columns + cpc_columns diff --git a/methods/matching/calculate_k.py b/methods/matching/calculate_k.py index dfecd05..a5fa156 100644 --- a/methods/matching/calculate_k.py +++ b/methods/matching/calculate_k.py @@ -21,10 +21,13 @@ LARGE_PROJECT_PIXEL_DENSITY_PER_HECTARE = 0.05 # The '2 *' in this is because I'm just considering one axis, rather than area -PIXEL_SKIP_SMALL_PROJECT = \ - round((HECTARE_WIDTH_IN_METERS / (2 * SMALL_PROJECT_PIXEL_DENSITY_PER_HECTARE)) / PIXEL_WIDTH_IN_METERS) -PIXEL_SKIP_LARGE_PROJECT = \ - round((HECTARE_WIDTH_IN_METERS / (2 * LARGE_PROJECT_PIXEL_DENSITY_PER_HECTARE)) / PIXEL_WIDTH_IN_METERS) +# PIXEL_SKIP_SMALL_PROJECT = \ +# round((HECTARE_WIDTH_IN_METERS / (2 * SMALL_PROJECT_PIXEL_DENSITY_PER_HECTARE)) / PIXEL_WIDTH_IN_METERS) +# PIXEL_SKIP_LARGE_PROJECT = \ +# round((HECTARE_WIDTH_IN_METERS / (2 * LARGE_PROJECT_PIXEL_DENSITY_PER_HECTARE)) / PIXEL_WIDTH_IN_METERS) + +PIXEL_SKIP_SMALL_PROJECT = 1 +PIXEL_SKIP_LARGE_PROJECT = 1 MatchingCollection = namedtuple('MatchingCollection', ['boundary', 'lucs', 'cpcs', 'ecoregions', 'elevation', 'slope', 'access', 'countries']) diff --git a/methods/matching/cluster_find_pairs_interactive.py b/methods/matching/cluster_find_pairs_interactive.py new file mode 100644 index 0000000..215722b --- /dev/null +++ b/methods/matching/cluster_find_pairs_interactive.py @@ -0,0 +1,374 @@ +# source myenv/bin/activate + +import pandas as pd +import numpy as np +from numba import njit +from sklearn.decomposition import PCA +from sklearn.cluster import KMeans +import matplotlib.pyplot as plt +import geopandas as gpd +from pyproj import Proj, transform +import os +import 
time +import sys +import faiss + +# Read K and M +# Repeat 100 times - Ultimately we would like to remove this step +# PCA across K and M +# Divide into clusters - Check how much splitting into clusters speeds things up +# Sample K to find K_sub +# Sample M to find M_sub +# Split M into LUC combinations - How big is each set? +# For each cluster in K_sub +# Split cluster in K_sub into categorical combinations - How big is each set? +# For each categorical combination +# Sample from the equivalent cluster and categorical combindation in M_sub +# Find pairs for the categorical combinations in K_sub from the categorical combinations in M_sub +# RowBind categorical combination sets +# RowBind cluster sets +# Save Pairs + + +# NOTES +# 1. We might need to combine some categorical subsets because at least for comparing validity of the matches +# because if the subsets are too small the standardised mean differences can be very wrong +# 2. One option would be to combine clusters based on the proximity of the cluster centres. For the LUCs, we might +# combine groups that are in the same state when the project begins, even if the LUC history is not the same. +# 3. There is a question of how much supposed additionality is created by each categorical subset? If it is +# nothing, it might not matter. If it is substantive then it definitely does matter. + + + +# this function uses loops instead of numpy vector operations +@njit +def loop_match(m_pca, k_pca): + picked = np.ones((m_pca.shape[0],), dtype=np.bool_) + fast_matches = np.full(k_pca.shape[0], -1, dtype=np.int32) + for i in range(0, k_pca.shape[0]): + min_squared_diff_sum = np.inf + min_squared_diff_j = -1 + for j in range(0, m_pca.shape[0]): + if picked[j]: + squared_diff_sum = np.sum((m_pca[j, :] - k_pca[i, :])**2) + if squared_diff_sum < min_squared_diff_sum: + min_squared_diff_sum = squared_diff_sum + min_squared_diff_j = j + fast_matches[i] = min_squared_diff_j + picked[min_squared_diff_j] = False + return fast_matches + +### Now try with real numbers + +def to_int32(x): + # Normalize the data to the range 0 to 1 + min_val = np.min(x) + max_val = np.max(x) + normalized_data = (x - min_val) / (max_val - min_val) + + # Scale the normalized data to the range 0 to 255 for unsigned 8-bit integers + scaled_data = normalized_data * 255 + + # Convert to 32-bit integers (0 to 255) + int32_data = scaled_data.astype(np.int32) + + return int32_data + +def to_pca_int32(x): + # Perform PCA and convert to dataframe + pca = PCA(n_components=min(len(x), len(x.columns)), + whiten=False) # Centering and scaling done by default + pca_result = pca.fit_transform(x) + pca_df = pd.DataFrame(pca_result) + # Convert all columns to int8 + pca_32 = pca_df.apply(to_int32) + return pca_32 + + +def calculate_smd(group1, group2): + # Means + mean1, mean2 = np.mean(group1), np.mean(group2) + # Standard deviations + std1, std2 = np.std(group1, ddof=1), np.std(group2, ddof=1) + # Sample sizes + n1, n2 = len(group1), len(group2) + # Pooled standard deviation + pooled_std = np.sqrt(((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2)) + # Standardized mean difference + smd = (mean1 - mean2) / pooled_std + return smd, mean1, mean2, pooled_std + + +K_SUB_PROPORTION = 0.01 +M_SUB_PROPORTION = 0.1 +# Number of clusters +N_CLUSTERS = 9 +# Number of iterations for K means fitting +N_ITERATIONS = 100 +VERBOSE = True + +# Define the start year +t0 = 2012 # READ THIS IN + +# Read in the data +boundary = gpd.read_file('/maps/aew85/projects/1201.geojson') + +k_pixels = 
pd.read_parquet('/maps/tws36/tmf_pipe_out/1201/k.parquet') +# k_pixels = pd.read_parquet('/maps/tws36/tmf_pipe_out/1201/k_all.parquet') +m_pixels = pd.read_parquet('/maps/aew85/tmf_pipe_out/1201/matches.parquet') + + +t0 = 2018 +boundary = gpd.read_file('/maps/aew85/projects/ona.geojson') +k_pixels = pd.read_parquet('/maps/aew85/tmf_pipe_out/fastfp_test_ona/k.parquet') +m_pixels = pd.read_parquet('/maps/aew85/tmf_pipe_out/fastfp_test_ona/matches.parquet') + +if(m_pixels.shape[0] > (k_pixels.shape[0])): + m_sub_size = int(k_pixels.shape[0]) # First down sample M as it is ~230 million points + m_random_indices = np.random.choice(m_pixels.shape[0], size=m_sub_size, replace=False) + m_pixels = m_pixels.iloc[m_random_indices] + +# # Calculate the central coordinates (centroid) +# central_lat = m_pixels['lat'].mean() +# central_lon = m_pixels['lng'].mean() +# aeqd_proj = f"+proj=aeqd +lat_0={central_lat} +lon_0={central_lon} +datum=WGS84" + +# # Convert the DataFrame to a GeoDataFrame +# m_gdf = gpd.GeoDataFrame(m_pixels, geometry=gpd.points_from_xy(m_pixels.lng, m_pixels.lat)) +# # Set the original CRS to WGS84 (EPSG:4326) +# m_gdf.set_crs(epsg=4326, inplace=True) + +# # Transform the GeoDataFrame to the AEQD projection +# m_gdf_aeqd = m_gdf.to_crs(aeqd_proj) + +# # Extract the transformed coordinates +# gdf_aeqd['aeqd_x'] = gdf_aeqd.geometry.x +# gdf_aeqd['aeqd_y'] = gdf_aeqd.geometry.y + +# # Define the grid resolution in meters +# grid_resolution_m = 5000 # 5 km + +# # Calculate grid cell indices +# gdf_aeqd['grid_x'] = (gdf_aeqd['aeqd_x'] // grid_resolution_m).astype(int) +# gdf_aeqd['grid_y'] = (gdf_aeqd['aeqd_y'] // grid_resolution_m).astype(int) + +# # Print the first few rows to verify +# print(gdf_aeqd.head()) + + +# concat m and k +km_pixels = pd.concat([k_pixels.assign(trt='trt', ID=range(0, len(k_pixels))), + m_pixels.assign(trt='ctrl', ID=range(0, len(m_pixels)))], ignore_index=True) + +# Select columns (excluding 'x', 'y', 'lat', 'lng', 'country', 'ecoregion', 'trt', and those starting with 'luc') +exclude_columns = ['ID', 'x', 'y', 'lat', 'lng', 'country', 'ecoregion', 'trt'] +exclude_columns += [col for col in km_pixels.columns if col.startswith('luc')] + +# Extract only the continuous columns +continuous_columns = km_pixels.columns.difference(exclude_columns) +km_pixels_selected = km_pixels[continuous_columns] + +# PCA transform and conversion to 32 bit ints +km_pca = to_pca_int32(km_pixels_selected) + +# Looks good +km_pca.head() + +#------------------------------------------ + +# Initialize the KMeans object +kmeans = faiss.Kmeans(d=km_pca.shape[1], k=N_CLUSTERS, niter=N_ITERATIONS, verbose=True) +# Perform clustering +kmeans.train(km_pca) + +# Get cluster assignments +km_pixels['cluster'] = kmeans.index.search(km_pca, 1)[1].flatten() + +# Frequency distribution in each cluster +cluster_counts = pd.Series(km_pixels['cluster']).value_counts() +if VERBOSE: + print("Cluster counts:\n", cluster_counts) + + +# Convert to spatial (simple features) +km_pixels_sf = gpd.GeoDataFrame( + km_pixels, + geometry=gpd.points_from_xy(km_pixels['lng'], km_pixels['lat']), + crs="EPSG:4326" +) + + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Plot cluster centres + +# Get cluster centers +centroids = kmeans.centroids + +if VERBOSE: + # Plot the cluster centers + plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=100, marker='x', label='Cluster centers') + # Add cluster IDs as labels on the plot + for i, center in enumerate(centroids): + plt.text(center[0], center[1], str(i), 
color='red', fontsize=12, weight='bold') + + plt.title('K-means Clustering with Faiss') + plt.xlabel('PCA Component 1') + plt.ylabel('PCA Component 2') + plt.legend() + plt.show() + plt.savefig('Figures/ona_cluster_centres_faiss_1.png') + plt.close() # Close the plot to free up memory + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Plot clusters as separate facets +if VERBOSE: + fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 15)) + axes = axes.flatten() + + clusters = sorted(km_pixels_sf['cluster'].unique()) + + for i, cluster in enumerate(clusters): + ax = axes[i] + cluster_data = km_pixels_sf[km_pixels_sf['cluster'] == cluster] + cluster_data.plot(ax=ax, color='blue', markersize=0.2) + boundary.plot(ax=ax, edgecolor='black', facecolor='none') + ax.set_title(f'Cluster {cluster}') + ax.set_axis_off() + + # Turn off any unused subplots + for j in range(len(clusters), len(axes)): + fig.delaxes(axes[j]) + + plt.tight_layout() + plt.savefig('Figures/Ona_cluster_faiss_1_facet.png') + plt.close() + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +match_years = [t0-10, t0-5, t0] +match_cats = ["ecoregion", "country", "cluster"] + ["luc_" + str(year) for year in match_years] + +# Extract K and M pixels +k_pixels = km_pixels.loc[km_pixels['trt'] == 'trt'] +m_pixels = km_pixels.loc[km_pixels['trt'] == 'ctrl'] + +# Extract K and M PCA transforms +k_pca = km_pca.loc[km_pixels['trt'] == 'trt'].to_numpy() +m_pca = km_pca.loc[km_pixels['trt'] == 'ctrl'].to_numpy() + +k_sub_size = int(k_pixels.shape[0]* K_SUB_PROPORTION) +m_sub_size = int(m_pixels.shape[0] * 1) + +# Define indexs for the samples from K and M +k_random_indices = np.random.choice(k_pixels.shape[0], size=k_sub_size, replace=False) +m_random_indices = np.random.choice(m_pixels.shape[0], size=m_sub_size, replace=False) + +# Take random samples from K and M pixels +k_sub = k_pixels.iloc[k_random_indices] +m_sub = m_pixels.iloc[m_random_indices] + +# Take corresponding random samples from the PCA transformed K and M +k_sub_pca = k_pca[k_random_indices,:] +m_sub_pca = m_pca[m_random_indices,:] + +if VERBOSE: + # Handy code for displaying the number of counts in each unique category combination + # In K + k_combination_counts = k_sub.groupby(match_cats).size().reset_index(name='counts') + print("k_combination_counts") + print(k_combination_counts) + # In M + m_combination_counts = m_sub.groupby(match_cats).size().reset_index(name='counts') + print("m_combination_counts") + print(m_combination_counts) + + +# Identify the unique combinations of luc columns +k_cat_combinations = k_sub[match_cats].drop_duplicates().sort_values(by=match_cats, ascending=[True] * len(match_cats)) + +pairs_list = [] + +start_time = time.time() +for i in range(0, k_cat_combinations.shape[0]): + # i = 6 # ith element of the unique combinations of the luc time series in k + # for in range() + k_cat_comb = k_cat_combinations.iloc[i] + k_cat = k_sub[(k_sub[match_cats] == k_cat_comb).all(axis=1)] + k_cat_pca = k_sub_pca[(k_sub[match_cats] == k_cat_comb).all(axis=1)] + + # Find the subset in km_pixels that matches this combination + m_cat = m_sub[(m_sub[match_cats] == k_cat_comb).all(axis=1)] + m_cat_pca = m_sub_pca[(m_sub[match_cats] == k_cat_comb).all(axis=1)] + + if VERBOSE: + print('ksub_cat:' + str(k_cat.shape[0])) + print('msub_cat:' + str(m_cat.shape[0])) + + # If there is no suitable match for the pre-project luc time series + # Then it may be preferable to just take the luc state at t0 + # m_luc_comb = m_pixels[(m_pixels[match_luc_years[1:3]] == 
K_luc_comb[1:3]).all(axis=1)] + # m_luc_comb = m_pixels[(m_pixels[match_luc_years[2:3]] == K_luc_comb[2:3]).all(axis=1)] + # For if there are no matches return nothing + + if(m_cat.shape[0] < k_cat.shape[0] * 5): + print("M insufficient for matching. Set VERBOSE to True for more details.") + continue + + matches_index = loop_match(m_cat_pca, k_cat_pca) + m_cat_matches = m_cat.iloc[matches_index] + + # i = 0 + # matched = pd.concat([k_cat.iloc[i], m_cat.iloc[matches[i]]], axis=1, ignore_index=True) + # matched.columns = ['trt', 'ctrl'] + # matched + #Looks great! + columns_to_compare = ['access', 'cpc0_d', 'cpc0_u', 'cpc10_d', 'cpc10_u', 'cpc5_d', 'cpc5_u', 'elevation', 'slope'] + # Calculate SMDs for the specified columns + smd_results = [] + for column in columns_to_compare: + smd, mean1, mean2, pooled_std = calculate_smd(k_cat[column], m_cat_matches[column]) + smd_results.append((column, smd, mean1, mean2, pooled_std)) + + # Convert the results to a DataFrame for better readability + smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std']) + + if VERBOSE: + # Print the results + print("categorical combination:") + print(k_cat_comb) + # Count how many items in 'column1' are not equal to the specified integer value + print("LUC flips in K:") + (k_cat['luc_2022'] != k_cat_comb['luc_' + str(t0)]).sum() + print("LUC flips in matches:") + (m_cat_matches['luc_2022'] != k_cat_comb['luc_' + str(t0)]).sum() + print("Standardized Mean Differences:") + print(smd_df) + + # Join the pairs into one dataframe: + k_cat = k_cat.reset_index(drop = True) + m_cat_matches = m_cat_matches.reset_index(drop = True) + pairs_df = pd.concat([k_cat.add_prefix('k_'), m_cat_matches.add_prefix('s_')], axis=1) + + # Append the resulting DataFrame to the list + pairs_list.append(pairs_df) + +# Combine all the DataFrames in the list into a single DataFrame +combined_pairs = pd.concat(pairs_list, ignore_index=True) + +end_time = time.time() +elapsed_time = end_time - start_time +if VERBOSE: + print(f"Elapsed time: {elapsed_time:.2f} seconds") + +columns_to_compare = ['access', 'cpc0_d', 'cpc0_u', 'cpc10_d', 'cpc10_u', 'cpc5_d', 'cpc5_u', 'elevation', 'slope'] +# Calculate SMDs for the specified columns +smd_results = [] +for column in columns_to_compare: + smd, mean1, mean2, pooled_std = calculate_smd(combined_pairs['k_' + column], combined_pairs['s_' + column]) + smd_results.append((column, smd, mean1, mean2, pooled_std)) + +# Convert the results to a DataFrame for better readability +smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std']) +print(smd_df) + + diff --git a/methods/matching/find_pairs.py b/methods/matching/find_pairs.py index 7f17782..668736b 100644 --- a/methods/matching/find_pairs.py +++ b/methods/matching/find_pairs.py @@ -3,16 +3,26 @@ import logging from functools import partial from multiprocessing import Pool, cpu_count, set_start_method -from numba import jit # type: ignore +from numba import njit # type: ignore import numpy as np import pandas as pd +from sklearn.decomposition import PCA +import faiss from methods.common.luc import luc_matching_columns REPEAT_MATCH_FINDING = 100 -DEFAULT_DISTANCE = 10000000.0 DEBUG = False +K_SUB_PROPORTION = 0.01 +M_SUB_PROPORTION = 1 +# Number of clusters +NUM_CLUSTERS = 9 +# Number of iterations for K means fitting +NUM_ITERATIONS = 100 +RELATIVE_MATCH_YEARS = [-10, -5, 0] + + DISTANCE_COLUMNS = [ "elevation", "slope", "access", "cpc0_u", "cpc0_d", @@ -23,297 
+33,246 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") +# this function uses loops instead of numpy vector operations +@njit +def loop_match(m_pca, k_pca): + picked = np.ones((m_pca.shape[0],), dtype=np.bool_) + fast_matches = np.full(k_pca.shape[0], -1, dtype=np.int32) + for i in range(0, k_pca.shape[0]): + min_squared_diff_sum = np.inf + min_squared_diff_j = -1 + for j in range(0, m_pca.shape[0]): + if picked[j]: + squared_diff_sum = np.sum((m_pca[j, :] - k_pca[i, :])**2) + if squared_diff_sum < min_squared_diff_sum: + min_squared_diff_sum = squared_diff_sum + min_squared_diff_j = j + fast_matches[i] = min_squared_diff_j + picked[min_squared_diff_j] = False + return fast_matches + +### Now try with real numbers + +def to_int32(x): + # Normalize the data to the range 0 to 1 + min_val = np.min(x) + max_val = np.max(x) + normalized_data = (x - min_val) / (max_val - min_val) + # Scale the normalized data to the range 0 to 255 for unsigned 8-bit integers + scaled_data = normalized_data * 255 + # Convert to 32-bit integers (0 to 255) + int32_data = scaled_data.astype(np.int32) + return int32_data + +def to_pca_int32(x): + # Perform PCA and convert to dataframe + pca = PCA(n_components=min(len(x), len(x.columns)), + whiten=False) # Centering and scaling done by default + pca_result = pca.fit_transform(x) + pca_df = pd.DataFrame(pca_result) + # Convert all columns to int8 + pca_32 = pca_df.apply(to_int32) + return pca_32 + + +def calculate_smd(group1, group2): + # Means + mean1, mean2 = np.mean(group1), np.mean(group2) + # Standard deviations + std1, std2 = np.std(group1, ddof=1), np.std(group2, ddof=1) + # Sample sizes + n1, n2 = len(group1), len(group2) + # Pooled standard deviation + pooled_std = np.sqrt(((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2)) + # Standardized mean difference + smd = (mean1 - mean2) / pooled_std + return smd, mean1, mean2, pooled_std + +def rename_luc_columns(df, start_year, eval_year): + + # Define the range of years based on the central start_year + no_years_post = (eval_year - start_year) + 1 + years = range(start_year - 10, start_year + no_years_post) # Adjust the range as needed + new_column_names = {f'luc_{year}': f'luc_{year - start_year}' for year in years} + + # Rename columns based on the new column names mapping + renamed_df = df.rename(columns=new_column_names) + + return renamed_df + def find_match_iteration( - k_parquet_filename: str, - m_parquet_filename: str, + k_pixels: pd.DataFrame, + m_pixels: pd.DataFrame, start_year: int, + eval_year: int, + luc_match: bool, output_folder: str, idx_and_seed: tuple[int, int] ) -> None: logging.info("Find match iteration %d of %d", idx_and_seed[0] + 1, REPEAT_MATCH_FINDING) rng = np.random.default_rng(idx_and_seed[1]) - - logging.info("Loading K from %s", k_parquet_filename) - - # Methodology 6.5.7: For a 10% sample of K - k_set = pd.read_parquet(k_parquet_filename) - k_subset = k_set.sample( - frac=0.1, - random_state=rng - ).reset_index() - - logging.info("Loading M from %s", m_parquet_filename) - m_set = pd.read_parquet(m_parquet_filename) - - # get the column ids for DISTANCE_COLUMNS - thresholds_for_columns = np.array([ - 200.0, # Elev - 2.5, # Slope - 10.0, # Access - 0.1, # CPCs - 0.1, # CPCs - 0.1, # CPCs - 0.1, # CPCs - 0.1, # CPCs - 0.1, # CPCs - ]) - - logging.info("Preparing s_set...") - - m_dist_thresholded_df = m_set[DISTANCE_COLUMNS] / thresholds_for_columns - k_subset_dist_thresholded_df = k_subset[DISTANCE_COLUMNS] / thresholds_for_columns - - 
# convert to float32 numpy arrays and make them contiguous for numba to vectorise - m_dist_thresholded = np.ascontiguousarray(m_dist_thresholded_df, dtype=np.float32) - k_subset_dist_thresholded = np.ascontiguousarray(k_subset_dist_thresholded_df, dtype=np.float32) - - # LUC columns are all named with the year in, so calculate the column names - # for the years we are intested in - luc0, luc5, luc10 = luc_matching_columns(start_year) - # As well as all the LUC columns for later use - luc_columns = [x for x in m_set.columns if x.startswith('luc')] - - hard_match_columns = ['country', 'ecoregion', luc10, luc5, luc0] - assert len(hard_match_columns) == HARD_COLUMN_COUNT - - # similar to the above, make the hard match columns contiguous float32 numpy arrays - m_dist_hard = np.ascontiguousarray(m_set[hard_match_columns].to_numpy()).astype(np.int32) - k_subset_dist_hard = np.ascontiguousarray(k_subset[hard_match_columns].to_numpy()).astype(np.int32) - - # Methodology 6.5.5: S should be 10 times the size of K, in order to achieve this for every - # pixel in the subsample (which is 10% the size of K) we select 100 pixels. - required = 100 - - logging.info("Running make_s_set_mask... required: %d", required) - starting_positions = rng.integers(0, int(m_dist_thresholded.shape[0]), int(k_subset_dist_thresholded.shape[0])) - s_set_mask_true, no_potentials = make_s_set_mask( - m_dist_thresholded, - k_subset_dist_thresholded, - m_dist_hard, - k_subset_dist_hard, - starting_positions, - required - ) - - logging.info("Done make_s_set_mask. s_set_mask.shape: %a", {s_set_mask_true.shape}) - - s_set = m_set[s_set_mask_true] - potentials = np.invert(no_potentials) - - k_subset = k_subset[potentials] - logging.info("Finished preparing s_set. shape: %a", {s_set.shape}) - - # Notes: - # 1. Not all pixels may have matches - results = [] - matchless = [] - - s_set_for_cov = s_set[DISTANCE_COLUMNS] - logging.info("Calculating covariance...") - covarience = np.cov(s_set_for_cov, rowvar=False) - logging.info("Calculating inverse covariance...") - invconv = np.linalg.inv(covarience).astype(np.float32) - - # Match columns are luc10, luc5, luc0, "country" and "ecoregion" - s_set_match = s_set[hard_match_columns + DISTANCE_COLUMNS].to_numpy(dtype=np.float32) - # this is required so numba can vectorise the loop in greedy_match - s_set_match = np.ascontiguousarray(s_set_match) - - # Now we do the same thing for k_subset - k_subset_match = k_subset[hard_match_columns + DISTANCE_COLUMNS].to_numpy(dtype=np.float32) - # this is required so numba can vectorise the loop in greedy_match - k_subset_match = np.ascontiguousarray(k_subset_match) - - logging.info("Starting greedy matching... 
k_subset_match.shape: %s, s_set_match.shape: %s", - k_subset_match.shape, s_set_match.shape) - - add_results, k_idx_matchless = greedy_match( - k_subset_match, - s_set_match, - invconv + + match_years = [start_year + year for year in RELATIVE_MATCH_YEARS] + # The categorical columns: + if luc_match: + match_cats = ["ecoregion", "country", "cluster"] + ["luc_-10", "luc_-5", "luc_0"] + else: + match_cats = ["ecoregion", "country", "cluster"] + + if(m_pixels.shape[0] > (k_pixels.shape[0])): + m_sub_size = int(k_pixels.shape[0]) # First down sample M as it is ~230 million points + m_random_indices = np.random.choice(m_pixels.shape[0], size=m_sub_size, replace=False) + m_pixels = m_pixels.iloc[m_random_indices] + + # rename columns of each + k_pixels_renamed = rename_luc_columns(k_pixels, start_year, eval_year) + m_pixels_renamed = rename_luc_columns(m_pixels, start_year-10, eval_year) + + # concat m and k + km_pixels = pd.concat([k_pixels_renamed.assign(trt='trt', ID=range(0, len(k_pixels))), + m_pixels_renamed.assign(trt='ctrl', + ID=range(0, len(m_pixels)))], + ignore_index=True) + + # Extract only the continuous columns + km_pixels_distance = km_pixels[DISTANCE_COLUMNS] + # PCA transform and conversion to 32 bit ints + logging.info("Transforming continuous variables to PCA space") + km_pca = to_pca_int32(km_pixels_distance) + # Find clusters using Kmeans + logging.info("Starting cluster assignment using kmeans") + # Initialize the KMeans object + kmeans = faiss.Kmeans(d=km_pca.shape[1], k=NUM_CLUSTERS, niter=NUM_ITERATIONS, verbose=True) + # Perform clustering + kmeans.train(km_pca) + # Get cluster assignments + km_pixels['cluster'] = kmeans.index.search(km_pca, 1)[1].flatten() + + # Extract K and M pixels + k_pixels = km_pixels.loc[km_pixels['trt'] == 'trt'] + m_pixels = km_pixels.loc[km_pixels['trt'] == 'ctrl'] + # Extract K and M PCA transforms + k_pca = km_pca.loc[km_pixels['trt'] == 'trt'].to_numpy() + m_pca = km_pca.loc[km_pixels['trt'] == 'ctrl'].to_numpy() + # Draw subsamples + # Methodology 6.5.7: Needs to be updated !!! + k_sub_size = int(k_pixels.shape[0]* K_SUB_PROPORTION) + m_sub_size = int(m_pixels.shape[0] * M_SUB_PROPORTION) + # Define indexs for the samples from K and M + k_random_indices = np.random.choice(k_pixels.shape[0], size=k_sub_size, replace=False) + m_random_indices = np.random.choice(m_pixels.shape[0], size=m_sub_size, replace=False) + # Take random samples from K and M pixels + k_sub = k_pixels.iloc[k_random_indices] + m_sub = m_pixels.iloc[m_random_indices] + # Take corresponding random samples from the PCA transformed K and M + k_sub_pca = k_pca[k_random_indices,:] + m_sub_pca = m_pca[m_random_indices,:] + + logging.info("Starting greedy matching... 
k_sub.shape: %s, m_sub.shape: %s", + k_sub.shape, m_sub.shape) + + pairs, matchless = greedy_match( + k_sub, + m_sub, + k_sub_pca, + m_sub_pca, + match_cats ) - + + # Combine all the pairs DataFrames in the list into a single DataFrame + combined_pairs = pd.concat(pairs, ignore_index=True) + # Combine all the matchess DataFrames in the list into a single DataFrame + combined_matchless = pd.concat(matchless, ignore_index=True) logging.info("Finished greedy matching...") - + logging.info("Starting storing matches...") - - for result in add_results: - (k_idx, s_idx) = result - k_row = k_subset.iloc[k_idx] - match = s_set.iloc[s_idx] - - if DEBUG: - for hard_match_column in hard_match_columns: - if k_row[hard_match_column] != match[hard_match_column]: - print(k_row) - print(match) - raise ValueError("Hard match inconsistency") - - results.append( - [k_row.lat, k_row.lng] + [k_row[x] for x in luc_columns + DISTANCE_COLUMNS] + \ - [match.lat, match.lng] + [match[x] for x in luc_columns + DISTANCE_COLUMNS] - ) - - logging.info("Finished storing matches...") - - for k_idx in k_idx_matchless: - k_row = k_subset.iloc[k_idx] - matchless.append(k_row) - - columns = ['k_lat', 'k_lng'] + \ - [f'k_{x}' for x in luc_columns + DISTANCE_COLUMNS] + \ - ['s_lat', 's_lng'] + \ - [f's_{x}' for x in luc_columns + DISTANCE_COLUMNS] - - results_df = pd.DataFrame(results, columns=columns) - results_df.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}.parquet')) - - matchless_df = pd.DataFrame(matchless, columns=k_set.columns) - matchless_df.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}_matchless.parquet')) - + combined_pairs_df = pd.DataFrame(combined_pairs) + combined_pairs_df.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}.parquet')) + + combined_matchless_df = pd.DataFrame(combined_matchless) + combined_matchless_df.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}_matchless.parquet')) + logging.info("Finished find match iteration") -@jit(nopython=True, fastmath=True, error_model="numpy") -def make_s_set_mask( - m_dist_thresholded: np.ndarray, - k_subset_dist_thresholded: np.ndarray, - m_dist_hard: np.ndarray, - k_subset_dist_hard: np.ndarray, - starting_positions: np.ndarray, - required: int -): - m_size = m_dist_thresholded.shape[0] - k_size = k_subset_dist_thresholded.shape[0] - - s_include = np.zeros(m_size, dtype=np.bool_) - k_miss = np.zeros(k_size, dtype=np.bool_) - - for k in range(k_size): - matches = 0 - k_row = k_subset_dist_thresholded[k, :] - k_hard = k_subset_dist_hard[k] - - for index in range(m_size): - m_index = (index + starting_positions[k]) % m_size - - m_row = m_dist_thresholded[m_index, :] - m_hard = m_dist_hard[m_index] - - should_include = True - - # check that every element of m_hard matches k_hard - hard_equals = True - for j in range(m_hard.shape[0]): - if m_hard[j] != k_hard[j]: - hard_equals = False - - if not hard_equals: - should_include = False - else: - for j in range(m_row.shape[0]): - if abs(m_row[j] - k_row[j]) > 1.0: - should_include = False - - if should_include: - s_include[m_index] = True - matches += 1 - - # Don't find any more M's - if matches == required: - break - - k_miss[k] = matches == 0 - - return s_include, k_miss - -# Function which returns a boolean array indicating whether all values in a row are true -@jit(nopython=True, fastmath=True, error_model="numpy") -def rows_all_true(rows: np.ndarray): - # Don't use np.all because not supported by numba - - # Create an array of booleans for rows in s still available - all_true 
= np.ones((rows.shape[0],), dtype=np.bool_) - for row_idx in range(rows.shape[0]): - for col_idx in range(rows.shape[1]): - if not rows[row_idx, col_idx]: - all_true[row_idx] = False - break - - return all_true - - -@jit(nopython=True, fastmath=True, error_model="numpy") def greedy_match( - k_subset: np.ndarray, - s_set: np.ndarray, - invcov: np.ndarray + k_sub: pd.DataFrame, + m_sub: pd.DataFrame, + k_sub_pca: np.ndarray, + m_sub_pca: np.ndarray, + match_cats: list ): - # Create an array of booleans for rows in s still available - s_available = np.ones((s_set.shape[0],), dtype=np.bool_) - total_available = s_set.shape[0] - - results = [] + # Identify the unique combinations of categorical columns + k_cat_combinations = k_sub[match_cats].drop_duplicates().sort_values(by=match_cats, ascending=[True] * len(match_cats)) + + # Not all pixels may have matches + pairs = [] matchless = [] - - s_tmp = np.zeros((s_set.shape[0],), dtype=np.float32) - - for k_idx in range(k_subset.shape[0]): - k_row = k_subset[k_idx, :] - - hard_matches = rows_all_true(s_set[:, :HARD_COLUMN_COUNT] == k_row[:HARD_COLUMN_COUNT]) & s_available - hard_matches = hard_matches.reshape( - -1, - ) - - if total_available > 0: - # Now calculate the distance between the k_row and all the hard matches we haven't already matched - s_tmp[hard_matches] = batch_mahalanobis_squared( - s_set[hard_matches, HARD_COLUMN_COUNT:], k_row[HARD_COLUMN_COUNT:], invcov - ) - # Find the index of the minimum distance in s_tmp[hard_matches] but map it back to the index in s_set - if np.any(hard_matches): - min_dist_idx = np.argmin(s_tmp[hard_matches]) - min_dist_idx = np.arange(s_tmp.shape[0])[hard_matches][min_dist_idx] - - results.append((k_idx, min_dist_idx)) - s_available[min_dist_idx] = False - total_available -= 1 - else: - matchless.append(k_idx) - - return (results, matchless) - -# optimised batch implementation of mahalanobis distance that returns a distance per row -@jit(nopython=True, fastmath=True, error_model="numpy") -def batch_mahalanobis_squared(rows, vector, invcov): - # calculate the difference between the vector and each row (broadcasted) - diff = rows - vector - # calculate the distance for each row in one batch - dists = (np.dot(diff, invcov) * diff).sum(axis=1) - return dists - + + for i in range(0, k_cat_combinations.shape[0]): + # i = 6 # ith element of the unique combinations of the luc time series in k + # for in range() + k_cat_comb = k_cat_combinations.iloc[i] + k_cat_index = k_sub[match_cats] == k_cat_comb + k_cat = k_sub[(k_cat_index).all(axis=1)] + k_cat_pca = k_sub_pca[(k_cat_index).all(axis=1)] + + # Find the subset in km_pixels that matches this combination + m_cat_index = m_sub[match_cats] == k_cat_comb + m_cat = m_sub[(m_cat_index).all(axis=1)] + m_cat_pca = m_sub_pca[(m_cat_index).all(axis=1)] + + # If there is no suitable match for the pre-project luc time series + # Then it may be preferable to just take the luc state at t0 + # m_luc_comb = m_pixels[(m_pixels[match_luc_years[1:3]] == K_luc_comb[1:3]).all(axis=1)] + # m_luc_comb = m_pixels[(m_pixels[match_luc_years[2:3]] == K_luc_comb[2:3]).all(axis=1)] + # For now if there are no matches return nothing + + if(m_cat.shape[0] < k_cat.shape[0] * 5): + matchless.append(k_cat) + continue + + matches_index = loop_match(m_cat_pca, k_cat_pca) + m_cat_matches = m_cat.iloc[matches_index] + + # Join the pairs into one dataframe: + k_cat = k_cat.reset_index(drop = True) + m_cat_matches = m_cat_matches.reset_index(drop = True) + pairs_df = 
pd.concat([k_cat.add_prefix('k_'), m_cat_matches.add_prefix('s_')], axis=1) + # Append the resulting DataFrame to the list + pairs.append(pairs_df) + + return (pairs, matchless) def find_pairs( k_parquet_filename: str, m_parquet_filename: str, start_year: int, + eval_year: int, + luc_match: bool, seed: int, output_folder: str, processes_count: int ) -> None: + logging.info("Loading K from %s", k_parquet_filename) + k_pixels = pd.read_parquet(k_parquet_filename) + logging.info("Loading M from %s", m_parquet_filename) + m_pixels = pd.read_parquet(m_parquet_filename) + logging.info("Starting find pairs") os.makedirs(output_folder, exist_ok=True) - + rng = np.random.default_rng(seed) iteration_seeds = zip(range(REPEAT_MATCH_FINDING), rng.integers(0, 1000000, REPEAT_MATCH_FINDING)) - + with Pool(processes=processes_count) as pool: pool.map( partial( find_match_iteration, - k_parquet_filename, - m_parquet_filename, + k_pixels, + m_pixels, start_year, + eval_year, + luc_match, output_folder ), iteration_seeds @@ -346,6 +305,20 @@ def main(): dest="start_year", help="Year project started." ) + parser.add_argument( + "--eval_year", + type=int, + required=True, + dest="eval_year", + help="Year of evaluation." + ) + parser.add_argument( + "--luc_match", + type=bool, + required=True, + dest="luc_match", + help="Boolean determines whether matching should include LUCs." + ) parser.add_argument( "--seed", type=int, @@ -374,6 +347,8 @@ def main(): args.k_filename, args.m_filename, args.start_year, + args.eval_year, + args.luc_match, args.seed, args.output_directory_path, args.processes_count diff --git a/methods/matching/find_potential_matches.py b/methods/matching/find_potential_matches.py index f69865a..51ae8bf 100644 --- a/methods/matching/find_potential_matches.py +++ b/methods/matching/find_potential_matches.py @@ -119,8 +119,8 @@ def worker( matching_collection = build_layer_collection( example_jrc_layer.pixel_scale, example_jrc_layer.projection, - [start_year, start_year - 5, start_year - 10], - [start_year, start_year - 5, start_year - 10], + [start_year - 10, start_year - 15, start_year - 20], # create time offset in matching set + [start_year - 10, start_year - 15, start_year - 20], # create time offset in matching set matching_zone_filename, jrc_directory_path, cpc_directory_path, diff --git a/requirements.txt b/requirements.txt index bad4673..8118a36 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ scipy numba matplotlib geojson -git+https://github.com/quantifyearth/yirgacheffe@bd2e91c773a414f66340ebb8c13044a1b1a6045f +git+https://github.com/quantifyearth/yirgacheffe@cc89b4d8a0e97c1a11b730cd688a58b680023336 git+https://github.com/carboncredits/biomass-recovery@9e54f80832a7eca915ebd13b03df9db2a08aee9d # developement diff --git a/scripts/tmfpython.sh b/scripts/tmfpython.sh index beb8178..fdecea6 100755 --- a/scripts/tmfpython.sh +++ b/scripts/tmfpython.sh @@ -1,26 +1,24 @@ #!/bin/bash -#run with command: scripts/tmfpython.sh -p 1113 -t 2010 ... -#p: project ID +#run with command: scripts/tmfpython.sh -i '/maps/aew85/projects' -o '/maps/aew85/tmf_pipe_out' -p 1113 -t 2010 ... +#i: input dir - directory containing project shapefiles +#o: output dir - directory containing pipeline outputs +#p: project name/ID - must match name of shapefile #t: year of project start (t0) #e: evaluation year (default: 2022) -#r: whether to run an ex-post evaluation and knit the results in an R notebook (true/false, default: false). 
-#a: whether to run an ex-ante evaluation and knit the results in an R notebook (true/false, default: false). - -#NB running evaluations requires the evaluations code +#v: report - whether to run an ex-ante evaluation and knit the results in an R notebook (true/false, default: true). # Check which branch is currently checked out -current_branch=$(git rev-parse --abbrev-ref HEAD) +branch=$(git rev-parse --abbrev-ref HEAD) set -e ############ DEFAULTS ############### -input_dir="/maps/aew85/projects" -output_dir="/maps/aew85/tmf_pipe_out" +input_dir="" +output_dir="" eval_year=2022 -ex_post=false -ex_ante=false +report=true ##################################### @@ -28,43 +26,45 @@ function display_help() { echo "Usage: $0 [options]" echo echo "Options:" - echo " -p Project name" + echo " -i Input directory" + echo " -o Output directory" + echo " -p Project name" echo " -t Start year" echo " -e Evaluation year" - echo " -r Knit ex post evaluation? (true/false)" - echo " -a Knit ex ante evaluation? (true/false)" + echo " -r Knit ex ante evaluation as .Rmd? (true/false)" echo " -h Display this help message" - echo echo "Example:" - echo " $0 -p 'gola' -t 2012 -e 2021 -r true -a true" + echo " $0 -i '/maps/aew85/projects' -o '/maps/aew85/tmf_pipe_out -p 1201 -t 2012" } # Parse arguments -while getopts "p:t:e:r:a:h" flag +while getopts "i:o:p:t:e:v:h" flag do case "${flag}" in + i) input_dir=${OPTARG};; + o) output_dir=${OPTARG};; p) proj=${OPTARG};; t) t0=${OPTARG};; e) eval_year=${OPTARG};; - r) ex_post=${OPTARG};; - a) ex_ante=${OPTARG};; + r) report=${OPTARG};; h) display_help; exit 0;; *) echo "Invalid option: -${OPTARG}" >&2; display_help; exit 1;; esac done +echo "Input directory: $input_dir" +echo "Output directory: $output_dir" echo "Project: $proj" echo "t0: $t0" echo "Evaluation year: $eval_year" -echo "Ex-post evaluation: $ex_post" -echo "Ex-ante evaluation: $ex_ante" +echo "Ex-ante evaluation: $report" if [ $# -eq 0 ]; then display_help exit 1 fi -# Make project output directory +# Make project output folder mkdir -p "${output_dir}/${proj}" echo "--Folder created.--" @@ -181,47 +181,21 @@ tmfpython3 -m methods.matching.build_m_table \ echo "--Set M created.--" #Matching: find pairs -tmfpython3 -m methods.matching.find_pairs \ - --k "${output_dir}/${proj}/k.parquet" \ - --m "${output_dir}/${proj}/matches.parquet" \ - --start_year "$t0" \ - --output "${output_dir}/${proj}/pairs" \ - --seed 42 \ - -j 1 - echo "--Pairs matched.--" - -#Calculate additionality -if [ "$current_branch" == "mwd-check-stopping-criteria" ]; then - tmfpython3 -m methods.outputs.calculate_additionality \ - --project "${input_dir}/${proj}.geojson" \ - --project_start "$t0" \ - --evaluation_year "$eval_year" \ - --density "${output_dir}/${proj}/carbon-density.csv" \ - --matches "${output_dir}/${proj}/pairs" \ - --output "${output_dir}/${proj}/additionality.csv" \ - --stopping "${output_dir}/${proj}/stopping.csv" - echo "--Additionality and stopping criteria calculated.--" - else - tmfpython3 -m methods.outputs.calculate_additionality \ - --project "${input_dir}/${proj}.geojson" \ - --project_start "$t0" \ - --evaluation_year "$eval_year" \ - --density "${output_dir}/${proj}/carbon-density.csv" \ - --matches "${output_dir}/${proj}/pairs" \ - --output "${output_dir}/${proj}/additionality.csv" - echo "--Additionality calculated.--" -fi - -# Run ex post evaluation -if [ "$ex_post" == "true" ]; then -evaluations_dir="~/evaluations" -ep_output_file="${evaluations_dir}/${proj}_ex_post_evaluation.html" -Rscript -e 
"rmarkdown::render(input='~/evaluations/R/ex_post_evaluation_template.Rmd',output_file='${ep_output_file}',params=list(proj='${proj}',t0='${t0}',eval_year='${eval_year}',input_dir='${input_dir}',output_dir='${output_dir}',evaluations_dir='${evaluations_dir}'))" -fi +. ./venv/bin/activate +python3 -m methods.matching.find_pairs \ +--k "${output_dir}/${proj}/k.parquet" \ +--m "${output_dir}/${proj}/matches.parquet" \ +--start_year "$t0" \ +--eval_year "$eval_year" \ +--luc_match True \ +--output "${output_dir}/${proj}/pairs" \ +--seed 42 \ +-j 1 +echo "--Pairs matched.--" +deactivate # Run ex-ante evaluation -if [ "$ex_ante" == "true" ]; then -evaluations_dir="~/evaluations" -ea_output_file="${evaluations_dir}/${proj}_ex_ante_evaluation.html" -Rscript -e "rmarkdown::render(input='~/evaluations/R/ex_ante_evaluation_template.Rmd',output_file='${ea_output_file}',params=list(proj='${proj}',t0='${t0}',eval_year='${eval_year}',input_dir='${input_dir}',output_dir='${output_dir}',evaluations_dir='${evaluations_dir}'))" +if [ "$report" == "true" ]; then +ea_output_file="${output_dir}/${proj}_ex_ante_evaluation.html" +Rscript -e "rmarkdown::render(input='evaluations/ex_ante_evaluation_template.Rmd',output_file='${ea_output_file}',params=list(proj='${proj}',t0='${t0}',input_dir='${input_dir}',output_dir='${output_dir}',branch='${branch}'))" fi \ No newline at end of file