118 changes: 63 additions & 55 deletions methods/matching/find_pairs.py
@@ -8,6 +8,7 @@
import pandas as pd

from methods.common.luc import luc_matching_columns
from methods.utils.kd_tree import make_kdrangetree, make_rumba_tree

REPEAT_MATCH_FINDING = 100
DEFAULT_DISTANCE = 10000000.0
@@ -37,10 +38,8 @@ def find_match_iteration(

# Methodology 6.5.7: For a 10% sample of K
k_set = pd.read_parquet(k_parquet_filename)
k_subset = k_set.sample(
frac=0.1,
random_state=rng
).reset_index()
# TODO: This assumes the methodology is being updated to 100% of K
k_subset = k_set
Contributor:
Can we just collapse this change throughout, and when this is merged, bump the versions of both the code and the methodology?

Collaborator Author:
Will do when I merge


logging.info("Loading M from %s", m_parquet_filename)
m_set = pd.read_parquet(m_parquet_filename)
@@ -76,23 +75,28 @@ def find_match_iteration(
hard_match_columns = ['country', 'ecoregion', luc10, luc5, luc0]
assert len(hard_match_columns) == HARD_COLUMN_COUNT

# similar to the above, make the hard match columns contiguous float32 numpy arrays
m_dist_hard = np.ascontiguousarray(m_set[hard_match_columns].to_numpy()).astype(np.int32)
k_subset_dist_hard = np.ascontiguousarray(k_subset[hard_match_columns].to_numpy()).astype(np.int32)
# Methodology 6.5.5: S should be 10 times the size of K; to achieve this
# we select 10 pixels for each K.
# TODO: This assumes the methodology is being updated to 100% of K
required = 10

# Methodology 6.5.5: S should be 10 times the size of K, in order to achieve this for every
# pixel in the subsample (which is 10% the size of K) we select 100 pixels.
required = 100
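
Both versions of this block target the same overall sample size from Methodology 6.5.5, |S| = 10 × |K|; only the split between subsample fraction and per-pixel candidate count changes. A minimal arithmetic sketch (the |K| value below is invented for illustration):

k_total = 100_000                       # hypothetical |K|
old_s = int(0.1 * k_total) * 100        # 10% subsample, 100 candidates per sampled pixel
new_s = k_total * 10                    # full K, 10 candidates per pixel
assert old_s == new_s == 10 * k_total   # both give |S| = 10 * |K|
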
# Find the unique categories in K
hard_match_category_values = [k[hard_match_columns].to_numpy() for _, k in k_set.iterrows()]
# Use a dictionary comprehension to find the unique values for the category columns
# and then convert that into a list
hard_match_categories = list({k.tobytes(): k for k in hard_match_category_values}.values())
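
The dictionary comprehension above deduplicates the per-row category arrays by hashing their raw bytes, since numpy arrays themselves are not hashable. A standalone sketch of the same trick, with made-up category values:

import numpy as np

rows = [np.array([76, 412, 3], dtype=np.int64),
        np.array([76, 412, 3], dtype=np.int64),   # duplicate category
        np.array([76, 408, 3], dtype=np.int64)]
unique_rows = list({r.tobytes(): r for r in rows}.values())
print(len(unique_rows))  # 2: identical byte strings collapse to one dictionary entry

The trick relies on all rows sharing the same dtype and length, which holds here because every row comes from the same columns of k_set.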

logging.info("Running make_s_set_mask... required: %d", required)
starting_positions = rng.integers(0, int(m_dist_thresholded.shape[0]), int(k_subset_dist_thresholded.shape[0]))

s_set_mask_true, no_potentials = make_s_set_mask(
rng,
k_set,
m_set,
m_dist_thresholded,
k_subset_dist_thresholded,
m_dist_hard,
k_subset_dist_hard,
starting_positions,
required
hard_match_columns,
required,
hard_match_categories
)

logging.info("Done make_s_set_mask. s_set_mask.shape: %a", {s_set_mask_true.shape})
Expand Down Expand Up @@ -173,56 +177,60 @@ def find_match_iteration(

logging.info("Finished find match iteration")

@jit(nopython=True, fastmath=True, error_model="numpy")
def make_s_set_mask(
rng: np.random.Generator,
k_set: pd.DataFrame,
m_set: pd.DataFrame,
m_dist_thresholded: np.ndarray,
k_subset_dist_thresholded: np.ndarray,
hard_match_columns: list,
required: int,
hard_match_categories: list[np.ndarray]
):
s_set_mask_true = np.zeros(m_set.shape[0], dtype=np.bool_)
no_potentials = np.zeros(k_set.shape[0], dtype=np.bool_)

# Split K and M into those categories and create masks
for values in hard_match_categories:
k_selector = np.all(k_set[hard_match_columns] == values, axis=1)
m_selector = np.all(m_set[hard_match_columns] == values, axis=1)
logging.info(" category: %a |K|: %d |M|: %d", values, k_selector.sum(), m_selector.sum())
# Make masks for each of those pairs
key_s_set_mask_true, key_no_potentials = make_s_set_mask_rumba_inner(
m_dist_thresholded[m_selector],
k_subset_dist_thresholded[k_selector],
required,
rng
)
# Merge into one s_set_mask_true
s_set_mask_true[m_selector] = key_s_set_mask_true
# Merge into no_potentials
no_potentials[k_selector] = key_no_potentials
return s_set_mask_true, no_potentials
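
Each per-category call returns masks sized to that category's slice of M and K; the two assignments above scatter them back into the full-length arrays via boolean indexing. A small self-contained illustration of that merge pattern, with invented values:

import numpy as np

s_set_mask_true = np.zeros(6, dtype=np.bool_)
m_selector = np.array([False, True, False, True, True, False])  # M rows in this category
key_s_set_mask_true = np.array([True, False, True])              # result for just those rows
s_set_mask_true[m_selector] = key_s_set_mask_true
print(s_set_mask_true)  # [False  True False False  True False]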

def make_s_set_mask_rumba_inner(
m_dist_thresholded: np.ndarray,
k_subset_dist_thresholded: np.ndarray,
m_dist_hard: np.ndarray,
k_subset_dist_hard: np.ndarray,
starting_positions: np.ndarray,
required: int
k_set_dist_thresholded: np.ndarray,
required: int,
rng: np.random.Generator
):
k_size = k_set_dist_thresholded.shape[0]
m_size = m_dist_thresholded.shape[0]
k_size = k_subset_dist_thresholded.shape[0]

s_include = np.zeros(m_size, dtype=np.bool_)
k_miss = np.zeros(k_size, dtype=np.bool_)

for k in range(k_size):
matches = 0
k_row = k_subset_dist_thresholded[k, :]
k_hard = k_subset_dist_hard[k]

for index in range(m_size):
m_index = (index + starting_positions[k]) % m_size

m_row = m_dist_thresholded[m_index, :]
m_hard = m_dist_hard[m_index]
m_tree = make_kdrangetree(m_dist_thresholded, np.ones(m_dist_thresholded.shape[1]))

should_include = True
rumba_tree = make_rumba_tree(m_tree, m_dist_thresholded)

# check that every element of m_hard matches k_hard
hard_equals = True
for j in range(m_hard.shape[0]):
if m_hard[j] != k_hard[j]:
hard_equals = False

if not hard_equals:
should_include = False
else:
for j in range(m_row.shape[0]):
if abs(m_row[j] - k_row[j]) > 1.0:
should_include = False

if should_include:
s_include[m_index] = True
matches += 1

# Don't find any more M's
if matches == required:
break

k_miss[k] = matches == 0
for k in range(k_size):
k_row = k_set_dist_thresholded[k]
possible_s = rumba_tree.members_sample(k_row, required, rng)
if len(possible_s) == 0:
k_miss[k] = True
else:
s_include[possible_s] = True

return s_include, k_miss
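
The kd-range tree and rumba tree come from methods.utils.kd_tree, and this diff shows only the calls, not their implementation. As a rough mental model, members_sample is assumed to return indices of up to `required` M rows whose thresholded columns all lie within the widths passed to make_kdrangetree (1.0 per column here) of the given K row. A brute-force NumPy sketch of that assumed behaviour, not the repository's implementation:

import numpy as np

def sample_candidates_bruteforce(m_dist_thresholded, k_row, required, rng):
    # indices of M rows where every column is within +/-1.0 of the K row
    within = np.all(np.abs(m_dist_thresholded - k_row) <= 1.0, axis=1)
    candidates = np.flatnonzero(within)
    if candidates.size == 0:
        return candidates
    return rng.choice(candidates, size=min(required, candidates.size), replace=False)

rng = np.random.default_rng(0)
m_vals = rng.normal(size=(1000, 4)).astype(np.float32)
print(sample_candidates_bruteforce(m_vals, m_vals[0], 10, rng))

The sketch scans all of M for every K row, which is what the deleted inner loop also did; a spatial index presumably avoids that full scan, which looks to be the motivation for the tree-based rewrite.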
