# NOTE(review): route parameter restored as <uuid> to match the function
# signature; the angle-bracket markup was garbled in extraction — confirm
# against the deployed route.
@app.route('/entities/<uuid>/reindex-info', methods=['GET'])
def get_reindex_info(uuid):
    """
    Retrieve all data required to reindex a given entity.

    Parameters
    ----------
    uuid : str
        The HuBMAP ID or UUID of the target entity

    Returns
    -------
    flask.Response
        JSON containing the entity and its related data (ancestors,
        descendants, immediate relationships, and optional donor and
        sample data). Oversized payloads may be stashed by
        try_stash_response_body and answered with its response instead.
    """
    validate_token_if_auth_header_exists(request)
    token = get_internal_token()

    raw = app_neo4j_queries.get_reindex_info_raw(neo4j_driver_instance, uuid)
    if raw is None:
        return not_found_error(f"Entity {uuid} not found")

    # Cache of uuid -> triggered/normalized document. The related-entity
    # lists (ancestors, descendants, donors, samples, ...) overlap heavily,
    # so each distinct entity goes through the trigger pipeline at most once.
    triggered_cache = {}

    def _run_triggers(entity_dict):
        # Run the ON_INDEX trigger pipeline for one entity and normalize the
        # merged result. On failure, log and fall back to the raw node
        # properties so one bad entity does not fail the whole payload.
        try:
            generated = schema_manager.generate_triggered_data(
                trigger_type=TriggerTypeEnum.ON_INDEX,
                normalized_class=entity_dict['entity_type'],
                request_args=request.args,
                user_token=token,
                existing_data_dict=entity_dict,
                new_data_dict={},
                properties_to_skip=[]
            )
            complete = schema_manager.remove_none_values({**entity_dict, **generated})
            return schema_manager.normalize_document_result_for_response(entity_dict=complete)
        except Exception as e:
            logger.error(f"Trigger pipeline failed for {entity_dict.get('uuid')}: {e}")
            return entity_dict

    def _run_triggers_cached(entity_dict):
        uid = entity_dict.get('uuid')
        if uid not in triggered_cache:
            triggered_cache[uid] = _run_triggers(entity_dict)
        return triggered_cache[uid]

    result = {
        # The target entity also goes through the cache: an organ sample is
        # its own origin sample, so this avoids triggering the same node twice.
        "entity": _run_triggers_cached(raw["entity"]),
        "ancestors": [_run_triggers_cached(e) for e in raw["ancestors"]],
        "descendants": [_run_triggers_cached(e) for e in raw["descendants"]],
        "immediate_ancestors": [_run_triggers_cached(e) for e in raw["immediate_ancestors"]],
        "immediate_descendants": [_run_triggers_cached(e) for e in raw["immediate_descendants"]],
    }
    if raw.get("donor"):
        result["donor"] = _run_triggers_cached(raw["donor"])
    if raw.get("donors"):
        result["donors"] = [_run_triggers_cached(e) for e in raw["donors"]]
    if raw.get("origin_samples"):
        result["origin_samples"] = [_run_triggers_cached(e) for e in raw["origin_samples"]]
    if raw.get("source_samples"):
        result["source_samples"] = [_run_triggers_cached(e) for e in raw["source_samples"]]

    # Large responses may be redirected (e.g. to S3) instead of returned
    # inline; try_stash_response_body returns None when the body is small
    # enough to send directly.
    resp_body = json.dumps(result).encode('utf-8')
    try_resp = try_stash_response_body(resp_body)
    if try_resp is not None:
        return try_resp
    return jsonify(result)
def get_reindex_info_raw(neo4j_driver, uuid):
    """
    Retrieve raw Neo4j node properties required for reindexing.

    Parameters
    ----------
    neo4j_driver : neo4j.Driver object
        The Neo4j database connection pool
    uuid : str
        The HuBMAP ID or UUID of the target entity

    Returns
    -------
    dict or None
        None when no entity with the given uuid exists. Otherwise a dict
        with keys 'entity', 'ancestors', 'descendants',
        'immediate_ancestors', 'immediate_descendants' and, depending on
        the entity type, 'donors', 'donor', 'origin_samples' and
        'source_samples'.
    """
    with neo4j_driver.session() as session:

        # Target entity
        entity_record = session.run("""
            MATCH (e:Entity {uuid: $uuid})
            RETURN properties(e) AS entity
        """, uuid=uuid).single()
        if not entity_record:
            return None
        entity = dict(entity_record["entity"])
        entity_type = entity["entity_type"]

        # All ancestors (any path length), excluding Lab nodes
        ancestors_record = session.run("""
            MATCH (e:Entity {uuid: $uuid})
            OPTIONAL MATCH (e)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(a:Entity)
            WHERE a.entity_type <> 'Lab'
            RETURN apoc.coll.toSet(COLLECT(properties(a))) AS ancestors
        """, uuid=uuid).single()
        ancestors = [dict(a) for a in (ancestors_record["ancestors"] or [])]

        # All descendants (any path length)
        descendants_record = session.run("""
            MATCH (e:Entity {uuid: $uuid})
            OPTIONAL MATCH (e)-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]->(d:Entity)
            RETURN apoc.coll.toSet(COLLECT(properties(d))) AS descendants
        """, uuid=uuid).single()
        descendants = [dict(d) for d in (descendants_record["descendants"] or [])]

        # Immediate (single Activity hop) parents and children
        immediate_record = session.run("""
            MATCH (e:Entity {uuid: $uuid})
            OPTIONAL MATCH (e)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(parent:Entity)
            WHERE parent.entity_type <> 'Lab'
            WITH e, apoc.coll.toSet(COLLECT(properties(parent))) AS immediate_ancestors
            OPTIONAL MATCH (e)-[:ACTIVITY_INPUT]->(:Activity)-[:ACTIVITY_OUTPUT]->(child:Entity)
            RETURN immediate_ancestors,
                   apoc.coll.toSet(COLLECT(properties(child))) AS immediate_descendants
        """, uuid=uuid).single()
        immediate_ancestors = [dict(p) for p in (immediate_record["immediate_ancestors"] or [])]
        immediate_descendants = [dict(c) for c in (immediate_record["immediate_descendants"] or [])]

        result = {
            "entity": entity,
            "ancestors": ancestors,
            "descendants": descendants,
            "immediate_ancestors": immediate_ancestors,
            "immediate_descendants": immediate_descendants,
        }

        if entity_type.lower() in ['sample', 'dataset', 'publication']:
            donors_record = session.run("""
                MATCH (e:Entity {uuid: $uuid})<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Donor)
                RETURN apoc.coll.toSet(COLLECT(properties(d))) AS donors
            """, uuid=uuid).single()
            donors = [dict(d) for d in (donors_record["donors"] or [])]
            # 'donors' is always present for these entity types (possibly
            # empty); 'donor' is the first one, kept for backward
            # compatibility, and omitted when there are none.
            result['donors'] = donors
            if donors:
                result['donor'] = donors[0]

            origin_samples_record = session.run("""
                MATCH (e:Entity {uuid: $uuid})
                OPTIONAL MATCH (e)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Entity)
                WHERE s.entity_type = 'Sample'
                AND s.sample_category IS NOT NULL
                AND toLower(s.sample_category) = 'organ'
                AND s.organ IS NOT NULL
                AND trim(s.organ) <> ''
                RETURN apoc.coll.toSet(COLLECT(properties(s))) AS origin_samples
            """, uuid=uuid).single()
            origin_samples = [dict(s) for s in (origin_samples_record["origin_samples"] or [])]

            # An organ sample is its own origin sample. Coerce possible
            # None property values before string ops to avoid AttributeError.
            if (entity_type == 'Sample'
                    and (entity.get('sample_category') or '').lower() == 'organ'
                    and (entity.get('organ') or '').strip() != ''):
                origin_samples = [entity]
            result['origin_samples'] = origin_samples

        if entity_type.lower() in ['dataset', 'publication']:
            # Source samples: nearest Sample ancestors, i.e. those with no
            # closer Sample between them and the dataset.
            source_record = session.run("""
                MATCH (e:Entity {uuid: $uuid})
                MATCH (e)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Entity {entity_type: 'Sample'})
                WHERE NOT EXISTS {
                    MATCH (s)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(closer:Entity {entity_type: 'Sample'})
                    MATCH (closer)-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]->(e)
                }
                RETURN apoc.coll.toSet(COLLECT(properties(s))) AS source_samples
            """, uuid=uuid).single()
            result["source_samples"] = [dict(s) for s in (source_record["source_samples"] or [])]
        return result
"abcd1234-ef56-gh78-ij90-klmnop123456" '500': - description: Internal Error \ No newline at end of file + description: Internal Error + '/entities/{id}/reindex-info': + get: + summary: Retrieve all data required to reindex a given entity + parameters: + - name: id + in: path + description: The unique identifier of entity. This identifier can be either an HuBMAP ID (e.g. HBM123.ABCD.456) or UUID + required: true + schema: + type: string + responses: + '200': + description: Entity and its related data including ancestors, descendants, immediate relationships, and optional donor and sample data + content: + application/json: + schema: + type: object + examples: + reindexinfoexample: + summary: 'An example of a reindex-info response' + value: + entity: object + ancestors: [entity objects] + descendants: [entity objects] + immediate_ancestors: [entity objects] + immediate_descendants: [entity objects] + donor: object + origin_samples: [entity objects] + source_samples: [entity objects] + '400': + description: Invalid or misformatted entity identifier + '401': + description: The user's token has expired or the user did not supply a valid token + '403': + description: The user is not authorized to access the entity + '404': + description: The target entity could not be found + '500': + description: An unexpected error occurred + '/entities/{id}/dataset-documents': + get: + summary: Retrieve processed dataset documents associated with a collection, epicollection, or upload + parameters: + - name: id + in: path + description: The unique identifier of entity. This identifier can be either an HuBMAP ID (e.g. 
HBM123.ABCD.456) or UUID of a Collection, Epicollection, or Upload + required: true + schema: + type: string + responses: + '200': + description: A JSON object mapping dataset UUIDs to their processed document representations enriched via the trigger pipeline and normalized for response + content: + application/json: + schema: + type: object + additionalProperties: + type: object + examples: + datasetdocumentsexample: + summary: 'An example of a dataset-documents response' + value: + abcd1234-ef56-gh78-ij90-klmnop123456: + entity_type: 'Dataset' + uuid: 'abcd1234-ef56-gh78-ij90-klmnop123456' + hubmap_id: 'HBM123.ABCD.456' + status: 'Published' + data_types: ['CODEX'] + wxyz5678-ab12-cd34-ef56-ghijklmnopqr: + entity_type: 'Dataset' + uuid: 'wxyz5678-ab12-cd34-ef56-ghijklmnopqr' + hubmap_id: 'HBM456.WXYZ.789' + status: 'Published' + data_types: ['MxIF'] + '400': + description: Invalid or misformatted entity identifier + '401': + description: The user's token has expired or the user did not supply a valid token + '403': + description: The user is not authorized to access the entity + '404': + description: The target entity could not be found + '500': + description: An unexpected error occurred \ No newline at end of file diff --git a/src/app.py b/src/app.py index b437f000..95919eed 100644 --- a/src/app.py +++ b/src/app.py @@ -980,7 +980,8 @@ def run_triggers(entity_dict): except Exception as e: logger.error(f"Trigger pipeline failed for {entity_dict.get('uuid')}: {e}") return entity_dict - + # There can be a ton of overlap between the multiple values we're fetching, so we don't want to run the triggers on the same data more + # than once. So a simple cache is useful here. 
# NOTE(review): route parameter restored as <uuid> to match the function
# signature; the angle-bracket markup was garbled in extraction — confirm
# against the deployed route.
@app.route('/entities/<uuid>/dataset-documents', methods=['GET'])
def get_dataset_documents(uuid):
    """
    Retrieve dataset documents associated with a collection, epicollection, or upload.

    Parameters
    ----------
    uuid : str
        The UUID of the target entity (Collection, Epicollection, or Upload)

    Returns
    -------
    flask.Response
        JSON mapping dataset UUIDs to their document representations,
        normalized for response and stripped of selected large or
        unnecessary fields (ingest_metadata, metadata, files). Returns a
        404 error when the entity is not found or is not a supported type.
        Oversized payloads may be stashed by try_stash_response_body and
        answered with its response instead.
    """
    validate_token_if_auth_header_exists(request)

    entity_record = app_neo4j_queries.get_dataset_documents_raw(neo4j_driver_instance, uuid)
    if entity_record is None:
        return not_found_error(f"Entity {uuid} not found")

    result = {}
    for dataset_uuid, entity_dict in entity_record.items():
        try:
            complete = schema_manager.remove_none_values(dict(entity_dict))
            final = schema_manager.normalize_document_result_for_response(entity_dict=complete)
            # Strip fields that are large or unnecessary in the index document
            for field in ['ingest_metadata', 'metadata', 'files']:
                final.pop(field, None)
            result[dataset_uuid] = final
        except Exception as e:
            # One bad dataset should not fail the whole batch
            logger.error(f"Failed to process document for {dataset_uuid}: {e}")
            continue

    # Large responses may be redirected (e.g. to S3) instead of returned
    # inline; try_stash_response_body returns None when the body is small
    # enough to send directly.
    resp_body = json.dumps(result).encode('utf-8')
    try_resp = try_stash_response_body(resp_body)
    if try_resp is not None:
        return try_resp
    return jsonify(result)
def get_dataset_documents_raw(neo4j_driver, uuid):
    """
    Retrieve dataset node properties associated with a collection or upload.

    Parameters
    ----------
    neo4j_driver : neo4j.Driver object
        The Neo4j database connection pool
    uuid : str
        The UUID of the target entity (Collection, Epicollection, or Upload)

    Returns
    -------
    dict or None
        A dict mapping dataset UUIDs to their node properties for all
        datasets directly linked to the entity via IN_COLLECTION or
        IN_UPLOAD. An empty dict when no datasets are linked; None when
        the uuid does not exist or is not a supported entity type.
    """
    with neo4j_driver.session() as session:
        entity_record = session.run("""
            MATCH (e:Entity {uuid: $uuid})
            RETURN e.entity_type AS entity_type
        """, uuid=uuid).single()
        if not entity_record:
            return None
        entity_type = entity_record["entity_type"]

        # Map the supported root entity types to the label/relationship
        # used to reach their member datasets.
        if entity_type in ['Collection', 'Epicollection']:
            relationship = 'IN_COLLECTION'
            root_label = 'Collection'
        elif entity_type == 'Upload':
            relationship = 'IN_UPLOAD'
            root_label = 'Upload'
        else:
            return None

        # Label and relationship are interpolated from the fixed mapping
        # above, never from user input; the uuid stays parameterized.
        record = session.run(f"""
            MATCH (root:{root_label} {{uuid: $uuid}})<-[:{relationship}]-(d:Dataset)
            RETURN apoc.map.fromPairs(COLLECT([d.uuid, properties(d)])) AS result
        """, uuid=uuid).single()

        if not record or not record["result"]:
            return {}

        # Use a distinct name for the per-dataset uuid so the `uuid`
        # parameter is not shadowed.
        return {ds_uuid: dict(props) for ds_uuid, props in record["result"].items()}
95919eed..a50890e9 100644 --- a/src/app.py +++ b/src/app.py @@ -1037,16 +1037,7 @@ def get_dataset_documents(uuid): result = {} for dataset_uuid, entity_dict in entity_record.items(): try: - generated = schema_manager.generate_triggered_data( - trigger_type=TriggerTypeEnum.ON_INDEX, - normalized_class=entity_dict['entity_type'], - request_args=request.args, - user_token=token, - existing_data_dict=entity_dict, - new_data_dict={}, - properties_to_skip=["title"] - ) - complete = schema_manager.remove_none_values({**entity_dict, **generated}) + complete = schema_manager.remove_none_values({**entity_dict}) final = schema_manager.normalize_document_result_for_response(entity_dict=complete) for field in ['ingest_metadata', 'metadata', 'files']: final.pop(field, None) From cf72d5e6423178d275aa93db3d74eac0fa613380 Mon Sep 17 00:00:00 2001 From: DerekFurstPitt Date: Tue, 21 Apr 2026 00:37:08 -0400 Subject: [PATCH 4/5] wip: added donors --- src/app.py | 2 ++ src/app_neo4j_queries.py | 13 +++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/app.py b/src/app.py index a50890e9..c699c760 100644 --- a/src/app.py +++ b/src/app.py @@ -999,6 +999,8 @@ def run_triggers_cached(entity_dict): } if raw.get("donor"): result["donor"] = run_triggers_cached(raw["donor"]) + if raw.get("donors"): + result["donors"] = [run_triggers_cached(e) for e in raw["donors"]] if raw.get("origin_samples"): result["origin_samples"] = [run_triggers_cached(e) for e in raw["origin_samples"]] if raw.get("source_samples"): diff --git a/src/app_neo4j_queries.py b/src/app_neo4j_queries.py index c56810b9..9a066363 100644 --- a/src/app_neo4j_queries.py +++ b/src/app_neo4j_queries.py @@ -225,13 +225,14 @@ def get_reindex_info_raw(neo4j_driver, uuid): } if entity_type.lower() in ['sample', 'dataset', 'publication']: - donor_record = session.run(""" - MATCH (e:Entity {uuid: $uuid}) - MATCH (e)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Entity {entity_type: 'Donor'}) - RETURN properties(d) AS 
donor - LIMIT 1 + donors_record = session.run(""" + MATCH (e:Entity {uuid: $uuid})<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Donor) + RETURN apoc.coll.toSet(COLLECT(properties(d))) AS donors """, uuid=uuid).single() - donor = dict(donor_record["donor"]) if donor_record and donor_record["donor"] else None + donors = [dict(d) for d in (donors_record["donors"] or [])] + donor = donors[0] + if donors is not None: + result['donors'] = donors if donor is not None: result['donor'] = donor From 3d02671c29c9e78b96539b204ae2916464aa69e5 Mon Sep 17 00:00:00 2001 From: DerekFurstPitt Date: Tue, 21 Apr 2026 01:25:08 -0400 Subject: [PATCH 5/5] added support for donors in the reindex-info --- src/app_neo4j_queries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/app_neo4j_queries.py b/src/app_neo4j_queries.py index 9a066363..b1ba2012 100644 --- a/src/app_neo4j_queries.py +++ b/src/app_neo4j_queries.py @@ -230,7 +230,7 @@ def get_reindex_info_raw(neo4j_driver, uuid): RETURN apoc.coll.toSet(COLLECT(properties(d))) AS donors """, uuid=uuid).single() donors = [dict(d) for d in (donors_record["donors"] or [])] - donor = donors[0] + donor = donors[0] if donors else None if donors is not None: result['donors'] = donors if donor is not None: