118 changes: 63 additions & 55 deletions methods/matching/find_pairs.py
@@ -8,6 +8,7 @@
import pandas as pd

from methods.common.luc import luc_matching_columns
from methods.utils.kd_tree import make_kdrangetree, make_rumba_tree

REPEAT_MATCH_FINDING = 100
DEFAULT_DISTANCE = 10000000.0
@@ -37,10 +38,8 @@ def find_match_iteration(

# Methodology 6.5.7: For a 10% sample of K
k_set = pd.read_parquet(k_parquet_filename)
k_subset = k_set.sample(
frac=0.1,
random_state=rng
).reset_index()
# TODO: This assumes the methodology is being updated to 100% of K
k_subset = k_set
Contributor:
Can we just collapse this change throughout, and when this is merged, bump the versions of both the code and the methodology?

Collaborator Author:
Will do when I merge


logging.info("Loading M from %s", m_parquet_filename)
m_set = pd.read_parquet(m_parquet_filename)
@@ -76,23 +75,28 @@ def find_match_iteration(
hard_match_columns = ['country', 'ecoregion', luc10, luc5, luc0]
assert len(hard_match_columns) == HARD_COLUMN_COUNT

# similar to the above, make the hard match columns contiguous float32 numpy arrays
m_dist_hard = np.ascontiguousarray(m_set[hard_match_columns].to_numpy()).astype(np.int32)
k_subset_dist_hard = np.ascontiguousarray(k_subset[hard_match_columns].to_numpy()).astype(np.int32)
# Methodology 6.5.5: S should be 10 times the size of K; to achieve this
# we select 10 pixels for each K.
# TODO: This assumes the methodology is being updated to 100% of K
required = 10

# Methodology 6.5.5: S should be 10 times the size of K, in order to achieve this for every
# pixel in the subsample (which is 10% the size of K) we select 100 pixels.
required = 100
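
Both versions of this block target the same overall sample size from Methodology 6.5.5, |S| = 10 × |K|; only the split between subsample fraction and per-pixel candidate count changes. A minimal arithmetic sketch (the |K| value below is invented for illustration):

k_total = 100_000                       # hypothetical |K|
old_s = int(0.1 * k_total) * 100        # 10% subsample, 100 candidates per sampled pixel
new_s = k_total * 10                    # full K, 10 candidates per pixel
assert old_s == new_s == 10 * k_total   # both give |S| = 10 * |K|
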
# Find the unique categories in K
hard_match_category_values = [k[hard_match_columns].to_numpy() for _, k in k_set.iterrows()]
# Use a dictionary comprehension to find the unique values for the category columns
# and then convert that into a list
hard_match_categories = list({k.tobytes(): k for k in hard_match_category_values}.values())
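
The dictionary comprehension above deduplicates the per-row category arrays by hashing their raw bytes, since numpy arrays themselves are not hashable. A standalone sketch of the same trick, with made-up category values:

import numpy as np

rows = [np.array([76, 412, 3], dtype=np.int64),
        np.array([76, 412, 3], dtype=np.int64),   # duplicate category
        np.array([76, 408, 3], dtype=np.int64)]
unique_rows = list({r.tobytes(): r for r in rows}.values())
print(len(unique_rows))  # 2: identical byte strings collapse to one dictionary entry

The trick relies on all rows sharing the same dtype and length, which holds here because every row comes from the same columns of k_set.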

logging.info("Running make_s_set_mask... required: %d", required)
starting_positions = rng.integers(0, int(m_dist_thresholded.shape[0]), int(k_subset_dist_thresholded.shape[0]))

s_set_mask_true, no_potentials = make_s_set_mask(
rng,
k_set,
m_set,
m_dist_thresholded,
k_subset_dist_thresholded,
m_dist_hard,
k_subset_dist_hard,
starting_positions,
required
hard_match_columns,
required,
hard_match_categories
)

logging.info("Done make_s_set_mask. s_set_mask.shape: %a", {s_set_mask_true.shape})
Expand Down Expand Up @@ -173,56 +177,60 @@ def find_match_iteration(

logging.info("Finished find match iteration")

@jit(nopython=True, fastmath=True, error_model="numpy")
def make_s_set_mask(
rng: np.random.Generator,
k_set: pd.DataFrame,
m_set: pd.DataFrame,
m_dist_thresholded: np.ndarray,
k_subset_dist_thresholded: np.ndarray,
hard_match_columns: list,
required: int,
hard_match_categories: list[np.ndarray]
):
s_set_mask_true = np.zeros(m_set.shape[0], dtype=np.bool_)
no_potentials = np.zeros(k_set.shape[0], dtype=np.bool_)

# Split K and M into those categories and create masks
for values in hard_match_categories:
k_selector = np.all(k_set[hard_match_columns] == values, axis=1)
m_selector = np.all(m_set[hard_match_columns] == values, axis=1)
logging.info(" category: %a |K|: %d |M|: %d", values, k_selector.sum(), m_selector.sum())
# Make masks for each of those pairs
key_s_set_mask_true, key_no_potentials = make_s_set_mask_rumba_inner(
m_dist_thresholded[m_selector],
k_subset_dist_thresholded[k_selector],
required,
rng
)
# Merge into one s_set_mask_true
s_set_mask_true[m_selector] = key_s_set_mask_true
# Merge into no_potentials
no_potentials[k_selector] = key_no_potentials
return s_set_mask_true, no_potentials
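
Each per-category call returns masks sized to that category's slice of M and K; the two assignments above scatter them back into the full-length arrays via boolean indexing. A small self-contained illustration of that merge pattern, with invented values:

import numpy as np

s_set_mask_true = np.zeros(6, dtype=np.bool_)
m_selector = np.array([False, True, False, True, True, False])  # M rows in this category
key_s_set_mask_true = np.array([True, False, True])              # result for just those rows
s_set_mask_true[m_selector] = key_s_set_mask_true
print(s_set_mask_true)  # [False  True False False  True False]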

def make_s_set_mask_rumba_inner(
m_dist_thresholded: np.ndarray,
k_subset_dist_thresholded: np.ndarray,
m_dist_hard: np.ndarray,
k_subset_dist_hard: np.ndarray,
starting_positions: np.ndarray,
required: int
k_set_dist_thresholded: np.ndarray,
required: int,
rng: np.random.Generator
):
k_size = k_set_dist_thresholded.shape[0]
m_size = m_dist_thresholded.shape[0]
k_size = k_subset_dist_thresholded.shape[0]

s_include = np.zeros(m_size, dtype=np.bool_)
k_miss = np.zeros(k_size, dtype=np.bool_)

for k in range(k_size):
matches = 0
k_row = k_subset_dist_thresholded[k, :]
k_hard = k_subset_dist_hard[k]

for index in range(m_size):
m_index = (index + starting_positions[k]) % m_size

m_row = m_dist_thresholded[m_index, :]
m_hard = m_dist_hard[m_index]
m_tree = make_kdrangetree(m_dist_thresholded, np.ones(m_dist_thresholded.shape[1]))

should_include = True
rumba_tree = make_rumba_tree(m_tree, m_dist_thresholded)

# check that every element of m_hard matches k_hard
hard_equals = True
for j in range(m_hard.shape[0]):
if m_hard[j] != k_hard[j]:
hard_equals = False

if not hard_equals:
should_include = False
else:
for j in range(m_row.shape[0]):
if abs(m_row[j] - k_row[j]) > 1.0:
should_include = False

if should_include:
s_include[m_index] = True
matches += 1

# Don't find any more M's
if matches == required:
break

k_miss[k] = matches == 0
for k in range(k_size):
k_row = k_set_dist_thresholded[k]
possible_s = rumba_tree.members_sample(k_row, required, rng)
if len(possible_s) == 0:
k_miss[k] = True
else:
s_include[possible_s] = True

return s_include, k_miss
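
The kd-range tree and rumba tree come from methods.utils.kd_tree, and this diff shows only the calls, not their implementation. As a rough mental model, members_sample is assumed to return indices of up to `required` M rows whose thresholded columns all lie within the widths passed to make_kdrangetree (1.0 per column here) of the given K row. A brute-force NumPy sketch of that assumed behaviour, not the repository's implementation:

import numpy as np

def sample_candidates_bruteforce(m_dist_thresholded, k_row, required, rng):
    # indices of M rows where every column is within +/-1.0 of the K row
    within = np.all(np.abs(m_dist_thresholded - k_row) <= 1.0, axis=1)
    candidates = np.flatnonzero(within)
    if candidates.size == 0:
        return candidates
    return rng.choice(candidates, size=min(required, candidates.size), replace=False)

rng = np.random.default_rng(0)
m_vals = rng.normal(size=(1000, 4)).astype(np.float32)
print(sample_candidates_bruteforce(m_vals, m_vals[0], 10, rng))

The sketch scans all of M for every K row, which is what the deleted inner loop also did; a spatial index presumably avoids that full scan, which looks to be the motivation for the tree-based rewrite.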
