Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 85 additions & 1 deletion entity-api-spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3182,4 +3182,88 @@ paths:
hubmap_id: "HBM987.WXYZ.321"
uuid: "abcd1234-ef56-gh78-ij90-klmnop123456"
'500':
description: Internal Error
description: Internal Error
'/entities/{id}/reindex-info':
get:
summary: Retrieve all data required to reindex a given entity
parameters:
- name: id
in: path
description: The unique identifier of the entity. This identifier can be either a HuBMAP ID (e.g. HBM123.ABCD.456) or a UUID
required: true
schema:
type: string
responses:
'200':
description: Entity and its related data including ancestors, descendants, immediate relationships, and optional donor and sample data
content:
application/json:
schema:
type: object
examples:
reindexinfoexample:
summary: 'An example of a reindex-info response'
value:
entity: object
ancestors: [entity objects]
descendants: [entity objects]
immediate_ancestors: [entity objects]
immediate_descendants: [entity objects]
donor: object
origin_samples: [entity objects]
source_samples: [entity objects]
'400':
description: Invalid or misformatted entity identifier
'401':
description: The user's token has expired or the user did not supply a valid token
'403':
description: The user is not authorized to access the entity
'404':
description: The target entity could not be found
'500':
description: An unexpected error occurred
'/entities/{id}/dataset-documents':
get:
summary: Retrieve processed dataset documents associated with a collection, epicollection, or upload
parameters:
- name: id
in: path
description: The unique identifier of the entity. This identifier can be either a HuBMAP ID (e.g. HBM123.ABCD.456) or a UUID of a Collection, Epicollection, or Upload
required: true
schema:
type: string
responses:
'200':
description: A JSON object mapping dataset UUIDs to their processed document representations enriched via the trigger pipeline and normalized for response
content:
application/json:
schema:
type: object
additionalProperties:
type: object
examples:
datasetdocumentsexample:
summary: 'An example of a dataset-documents response'
value:
abcd1234-ef56-gh78-ij90-klmnop123456:
entity_type: 'Dataset'
uuid: 'abcd1234-ef56-gh78-ij90-klmnop123456'
hubmap_id: 'HBM123.ABCD.456'
status: 'Published'
data_types: ['CODEX']
wxyz5678-ab12-cd34-ef56-ghijklmnopqr:
entity_type: 'Dataset'
uuid: 'wxyz5678-ab12-cd34-ef56-ghijklmnopqr'
hubmap_id: 'HBM456.WXYZ.789'
status: 'Published'
data_types: ['MxIF']
'400':
description: Invalid or misformatted entity identifier
'401':
description: The user's token has expired or the user did not supply a valid token
'403':
description: The user is not authorized to access the entity
'404':
description: The target entity could not be found
'500':
description: An unexpected error occurred
112 changes: 112 additions & 0 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -941,6 +941,118 @@ def get_document_by_id(id):
result_dict = _get_metadata_by_id(entity_id=id, metadata_scope=MetadataScopeEnum.INDEX)
return jsonify(result_dict)

"""
Retrieve all data required to reindex a given entity

Parameters
----------
uuid : str
The HuBMAP ID or UUID of the target entity

Returns
-------
json
Entity and its related data (ancestors,
descendants, immediate relationships, and optional donor and sample data)
"""
@app.route('/entities/<uuid>/reindex-info', methods=['GET'])
def get_reindex_info(uuid):
validate_token_if_auth_header_exists(request)
token = get_internal_token()

raw = app_neo4j_queries.get_reindex_info_raw(neo4j_driver_instance, uuid)
if raw is None:
return not_found_error(f"Entity {uuid} not found")

def run_triggers(entity_dict):
try:
generated = schema_manager.generate_triggered_data(
trigger_type=TriggerTypeEnum.ON_INDEX,
normalized_class=entity_dict['entity_type'],
request_args=request.args,
user_token=token,
existing_data_dict=entity_dict,
new_data_dict={},
properties_to_skip=[]
)
complete = schema_manager.remove_none_values({**entity_dict, **generated})
return schema_manager.normalize_document_result_for_response(entity_dict=complete)
except Exception as e:
logger.error(f"Trigger pipeline failed for {entity_dict.get('uuid')}: {e}")
return entity_dict
# There can be a ton of overlap between the multiple values we're fetching, so we don't want to run the triggers on the same data more
# than once. So a simple cache is useful here.
triggered_cache = {}

def run_triggers_cached(entity_dict):
uid = entity_dict.get('uuid')
if uid not in triggered_cache:
triggered_cache[uid] = run_triggers(entity_dict)
return triggered_cache[uid]

result = {
"entity": run_triggers(raw["entity"]),
"ancestors": [run_triggers_cached(e) for e in raw["ancestors"]],
"descendants": [run_triggers_cached(e) for e in raw["descendants"]],
"immediate_ancestors": [run_triggers_cached(e) for e in raw["immediate_ancestors"]],
"immediate_descendants": [run_triggers_cached(e) for e in raw["immediate_descendants"]],
}
if raw.get("donor"):
result["donor"] = run_triggers_cached(raw["donor"])
if raw.get("donors"):
result["donors"] = [run_triggers_cached(e) for e in raw["donors"]]
if raw.get("origin_samples"):
result["origin_samples"] = [run_triggers_cached(e) for e in raw["origin_samples"]]
if raw.get("source_samples"):
result["source_samples"] = [run_triggers_cached(e) for e in raw["source_samples"]]
resp_body = json.dumps(result).encode('utf-8')
try_resp = try_stash_response_body(resp_body)
if try_resp is not None:
return try_resp
return jsonify(result)

"""
Retrieve processed dataset documents associated with a collection or upload

Parameters
----------
uuid : str
The UUID of the target entity (Collection, Epicollection, or Upload)

Returns
-------
json
A JSON object mapping dataset UUIDs to their processed document representations.
Each dataset is enriched via the trigger pipeline (ON_INDEX), normalized for response,
and stripped of selected large or unnecessary fields (e.g., ingest_metadata, metadata, files).
Returns a 404 error if the entity is not found.
"""
@app.route('/entities/<uuid>/dataset-documents', methods=['GET'])
def get_dataset_documents(uuid):
validate_token_if_auth_header_exists(request)
token = get_internal_token()

entity_record = app_neo4j_queries.get_dataset_documents_raw(neo4j_driver_instance, uuid)
if entity_record is None:
return not_found_error(f"Entity {uuid} not found")

result = {}
for dataset_uuid, entity_dict in entity_record.items():
try:
complete = schema_manager.remove_none_values({**entity_dict})
final = schema_manager.normalize_document_result_for_response(entity_dict=complete)
for field in ['ingest_metadata', 'metadata', 'files']:
final.pop(field, None)
result[dataset_uuid] = final
except Exception as e:
logger.error(f"Failed to process document for {dataset_uuid}: {e}")
continue

resp_body = json.dumps(result).encode('utf-8')
try_resp = try_stash_response_body(resp_body)
if try_resp is not None:
return try_resp
return jsonify(result)

"""
Retrieve the full tree above the referenced entity and build the provenance document
Expand Down
158 changes: 158 additions & 0 deletions src/app_neo4j_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,165 @@ def get_ancestor_organs(neo4j_driver, entity_uuid):

return results

"""
Retrieve raw Neo4j node properties required for reindexing

Parameters
----------
neo4j_driver : neo4j.Driver object
The Neo4j database connection pool
entity_uuid : str
The HuBMAP ID or UUID of the target entity

Returns
-------
dict
A dictionary of node properties matching the structure of get_entity(),
including related entities such as ancestors, descendants, donor,
origin_samples, source_samples, and immediate relationships
"""
def get_reindex_info_raw(neo4j_driver, uuid):

with neo4j_driver.session() as session:

# Target entity
entity_record = session.run("""
MATCH (e:Entity {uuid: $uuid})
RETURN properties(e) AS entity
""", uuid=uuid).single()
if not entity_record:
return None
entity = dict(entity_record["entity"])
entity_type = entity["entity_type"]

ancestors_record = session.run("""
MATCH (e:Entity {uuid: $uuid})
OPTIONAL MATCH (e)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(a:Entity)
WHERE a.entity_type <> 'Lab'
RETURN apoc.coll.toSet(COLLECT(properties(a))) AS ancestors
""", uuid=uuid).single()
ancestors = [dict(a) for a in (ancestors_record["ancestors"] or [])]

descendants_record = session.run("""
MATCH (e:Entity {uuid: $uuid})
OPTIONAL MATCH (e)-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]->(d:Entity)
RETURN apoc.coll.toSet(COLLECT(properties(d))) AS descendants
""", uuid=uuid).single()
descendants = [dict(d) for d in (descendants_record["descendants"] or [])]

immediate_record = session.run("""
MATCH (e:Entity {uuid: $uuid})
OPTIONAL MATCH (e)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(parent:Entity)
WHERE parent.entity_type <> 'Lab'
WITH e, apoc.coll.toSet(COLLECT(properties(parent))) AS immediate_ancestors
OPTIONAL MATCH (e)-[:ACTIVITY_INPUT]->(:Activity)-[:ACTIVITY_OUTPUT]->(child:Entity)
RETURN immediate_ancestors,
apoc.coll.toSet(COLLECT(properties(child))) AS immediate_descendants
""", uuid=uuid).single()
immediate_ancestors = [dict(p) for p in (immediate_record["immediate_ancestors"] or [])]
immediate_descendants = [dict(c) for c in (immediate_record["immediate_descendants"] or [])]

result = {
"entity": entity,
"ancestors": ancestors,
"descendants": descendants,
"immediate_ancestors": immediate_ancestors,
"immediate_descendants": immediate_descendants,
}

if entity_type.lower() in ['sample', 'dataset', 'publication']:
donors_record = session.run("""
MATCH (e:Entity {uuid: $uuid})<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Donor)
RETURN apoc.coll.toSet(COLLECT(properties(d))) AS donors
""", uuid=uuid).single()
donors = [dict(d) for d in (donors_record["donors"] or [])]
donor = donors[0] if donors else None
if donors is not None:
result['donors'] = donors
if donor is not None:
result['donor'] = donor

origin_samples_record = session.run("""
MATCH (e:Entity {uuid: $uuid})
OPTIONAL MATCH (e)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Entity)
WHERE s.entity_type = 'Sample'
AND s.sample_category IS NOT NULL
AND toLower(s.sample_category) = 'organ'
AND s.organ IS NOT NULL
AND trim(s.organ) <> ''
RETURN apoc.coll.toSet(COLLECT(properties(s))) AS origin_samples
""", uuid=uuid).single()
origin_samples = [dict(s) for s in (origin_samples_record["origin_samples"] or [])]

if (entity_type == 'Sample'
and entity.get('sample_category', '').lower() == 'organ'
and entity.get('organ', '').strip() != ''):
origin_samples = [entity]
if origin_samples is not None:
result['origin_samples'] = origin_samples

if entity_type.lower() in ['dataset', 'publication']:
source_record = session.run("""
MATCH (e:Entity {uuid: $uuid})
MATCH (e)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Entity {entity_type: 'Sample'})
WHERE NOT EXISTS {
MATCH (s)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(closer:Entity {entity_type: 'Sample'})
MATCH (closer)-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]->(e)
}
RETURN apoc.coll.toSet(COLLECT(properties(s))) AS source_samples
""", uuid=uuid).single()
result["source_samples"] = [dict(s) for s in (source_record["source_samples"] or [])]
return result

"""
Retrieve dataset documents associated with a collection or upload

Parameters
----------
neo4j_driver : neo4j.Driver object
The Neo4j database connection pool
uuid : str
The UUID of the target entity (Collection, Epicollection, or Upload)

Returns
-------
dict
A dictionary mapping dataset UUIDs to their node properties for all datasets
directly linked to the given entity via the appropriate relationship
(IN_COLLECTION or IN_UPLOAD). Returns an empty dictionary if no datasets
are found, or None if the input UUID does not correspond to a supported
entity type.
"""
def get_dataset_documents_raw(neo4j_driver, uuid):
with neo4j_driver.session() as session:
entity_record = session.run("""
MATCH (e:Entity {uuid: $uuid})
RETURN e.entity_type AS entity_type
""", uuid=uuid).single()
if not entity_record:
return None
entity_type = entity_record["entity_type"]

if entity_type in ['Collection', 'Epicollection']:
relationship = 'IN_COLLECTION'
root_label = 'Collection'
elif entity_type == 'Upload':
relationship = 'IN_UPLOAD'
root_label = 'Upload'
else:
return None

record = session.run(f"""
MATCH (root:{root_label} {{uuid: $uuid}})<-[:{relationship}]-(d:Dataset)
RETURN apoc.map.fromPairs(COLLECT([d.uuid, properties(d)])) AS result
""", uuid=uuid).single()

if not record or not record["result"]:
return {}

return {uuid: dict(props) for uuid, props in record["result"].items()}


"""
Create multiple sample nodes in neo4j

Expand Down
Loading