# NOTE(review): route parameter restored as <uuid> to match the function
# signature; the angle-bracket markup was garbled in extraction — confirm
# against the deployed route.
@app.route('/entities/<uuid>/reindex-info', methods=['GET'])
def get_reindex_info(uuid):
    """
    Retrieve all data required to reindex a given entity.

    Parameters
    ----------
    uuid : str
        The HuBMAP ID or UUID of the target entity

    Returns
    -------
    flask.Response
        JSON containing the entity and its related data (ancestors,
        descendants, immediate relationships, and optional donor and
        sample data). Oversized payloads may be stashed by
        try_stash_response_body and answered with its response instead.
    """
    validate_token_if_auth_header_exists(request)
    token = get_internal_token()

    raw = app_neo4j_queries.get_reindex_info_raw(neo4j_driver_instance, uuid)
    if raw is None:
        return not_found_error(f"Entity {uuid} not found")

    # Cache of uuid -> triggered/normalized document. The related-entity
    # lists (ancestors, descendants, donors, samples, ...) overlap heavily,
    # so each distinct entity goes through the trigger pipeline at most once.
    triggered_cache = {}

    def _run_triggers(entity_dict):
        # Run the ON_INDEX trigger pipeline for one entity and normalize the
        # merged result. On failure, log and fall back to the raw node
        # properties so one bad entity does not fail the whole payload.
        try:
            generated = schema_manager.generate_triggered_data(
                trigger_type=TriggerTypeEnum.ON_INDEX,
                normalized_class=entity_dict['entity_type'],
                request_args=request.args,
                user_token=token,
                existing_data_dict=entity_dict,
                new_data_dict={},
                properties_to_skip=[]
            )
            complete = schema_manager.remove_none_values({**entity_dict, **generated})
            return schema_manager.normalize_document_result_for_response(entity_dict=complete)
        except Exception as e:
            logger.error(f"Trigger pipeline failed for {entity_dict.get('uuid')}: {e}")
            return entity_dict

    def _run_triggers_cached(entity_dict):
        uid = entity_dict.get('uuid')
        if uid not in triggered_cache:
            triggered_cache[uid] = _run_triggers(entity_dict)
        return triggered_cache[uid]

    result = {
        # The target entity also goes through the cache: an organ sample is
        # its own origin sample, so this avoids triggering the same node twice.
        "entity": _run_triggers_cached(raw["entity"]),
        "ancestors": [_run_triggers_cached(e) for e in raw["ancestors"]],
        "descendants": [_run_triggers_cached(e) for e in raw["descendants"]],
        "immediate_ancestors": [_run_triggers_cached(e) for e in raw["immediate_ancestors"]],
        "immediate_descendants": [_run_triggers_cached(e) for e in raw["immediate_descendants"]],
    }
    if raw.get("donor"):
        result["donor"] = _run_triggers_cached(raw["donor"])
    if raw.get("donors"):
        result["donors"] = [_run_triggers_cached(e) for e in raw["donors"]]
    if raw.get("origin_samples"):
        result["origin_samples"] = [_run_triggers_cached(e) for e in raw["origin_samples"]]
    if raw.get("source_samples"):
        result["source_samples"] = [_run_triggers_cached(e) for e in raw["source_samples"]]

    # Large responses may be redirected (e.g. to S3) instead of returned
    # inline; try_stash_response_body returns None when the body is small
    # enough to send directly.
    resp_body = json.dumps(result).encode('utf-8')
    try_resp = try_stash_response_body(resp_body)
    if try_resp is not None:
        return try_resp
    return jsonify(result)
def get_reindex_info_raw(neo4j_driver, uuid):
    """
    Retrieve raw Neo4j node properties required for reindexing.

    Parameters
    ----------
    neo4j_driver : neo4j.Driver object
        The Neo4j database connection pool
    uuid : str
        The HuBMAP ID or UUID of the target entity

    Returns
    -------
    dict or None
        None when no entity with the given uuid exists. Otherwise a dict
        with keys 'entity', 'ancestors', 'descendants',
        'immediate_ancestors', 'immediate_descendants' and, depending on
        the entity type, 'donors', 'donor', 'origin_samples' and
        'source_samples'.
    """
    with neo4j_driver.session() as session:

        # Target entity
        entity_record = session.run("""
            MATCH (e:Entity {uuid: $uuid})
            RETURN properties(e) AS entity
        """, uuid=uuid).single()
        if not entity_record:
            return None
        entity = dict(entity_record["entity"])
        entity_type = entity["entity_type"]

        # All ancestors (any path length), excluding Lab nodes
        ancestors_record = session.run("""
            MATCH (e:Entity {uuid: $uuid})
            OPTIONAL MATCH (e)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(a:Entity)
            WHERE a.entity_type <> 'Lab'
            RETURN apoc.coll.toSet(COLLECT(properties(a))) AS ancestors
        """, uuid=uuid).single()
        ancestors = [dict(a) for a in (ancestors_record["ancestors"] or [])]

        # All descendants (any path length)
        descendants_record = session.run("""
            MATCH (e:Entity {uuid: $uuid})
            OPTIONAL MATCH (e)-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]->(d:Entity)
            RETURN apoc.coll.toSet(COLLECT(properties(d))) AS descendants
        """, uuid=uuid).single()
        descendants = [dict(d) for d in (descendants_record["descendants"] or [])]

        # Immediate (single Activity hop) parents and children
        immediate_record = session.run("""
            MATCH (e:Entity {uuid: $uuid})
            OPTIONAL MATCH (e)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(parent:Entity)
            WHERE parent.entity_type <> 'Lab'
            WITH e, apoc.coll.toSet(COLLECT(properties(parent))) AS immediate_ancestors
            OPTIONAL MATCH (e)-[:ACTIVITY_INPUT]->(:Activity)-[:ACTIVITY_OUTPUT]->(child:Entity)
            RETURN immediate_ancestors,
                   apoc.coll.toSet(COLLECT(properties(child))) AS immediate_descendants
        """, uuid=uuid).single()
        immediate_ancestors = [dict(p) for p in (immediate_record["immediate_ancestors"] or [])]
        immediate_descendants = [dict(c) for c in (immediate_record["immediate_descendants"] or [])]

        result = {
            "entity": entity,
            "ancestors": ancestors,
            "descendants": descendants,
            "immediate_ancestors": immediate_ancestors,
            "immediate_descendants": immediate_descendants,
        }

        if entity_type.lower() in ['sample', 'dataset', 'publication']:
            donors_record = session.run("""
                MATCH (e:Entity {uuid: $uuid})<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Donor)
                RETURN apoc.coll.toSet(COLLECT(properties(d))) AS donors
            """, uuid=uuid).single()
            donors = [dict(d) for d in (donors_record["donors"] or [])]
            # 'donors' is always present for these entity types (possibly
            # empty); 'donor' is the first one, kept for backward
            # compatibility, and omitted when there are none.
            result['donors'] = donors
            if donors:
                result['donor'] = donors[0]

            origin_samples_record = session.run("""
                MATCH (e:Entity {uuid: $uuid})
                OPTIONAL MATCH (e)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Entity)
                WHERE s.entity_type = 'Sample'
                AND s.sample_category IS NOT NULL
                AND toLower(s.sample_category) = 'organ'
                AND s.organ IS NOT NULL
                AND trim(s.organ) <> ''
                RETURN apoc.coll.toSet(COLLECT(properties(s))) AS origin_samples
            """, uuid=uuid).single()
            origin_samples = [dict(s) for s in (origin_samples_record["origin_samples"] or [])]

            # An organ sample is its own origin sample. Coerce possible
            # None property values before string ops to avoid AttributeError.
            if (entity_type == 'Sample'
                    and (entity.get('sample_category') or '').lower() == 'organ'
                    and (entity.get('organ') or '').strip() != ''):
                origin_samples = [entity]
            result['origin_samples'] = origin_samples

        if entity_type.lower() in ['dataset', 'publication']:
            # Source samples: nearest Sample ancestors, i.e. those with no
            # closer Sample between them and the dataset.
            source_record = session.run("""
                MATCH (e:Entity {uuid: $uuid})
                MATCH (e)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Entity {entity_type: 'Sample'})
                WHERE NOT EXISTS {
                    MATCH (s)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(closer:Entity {entity_type: 'Sample'})
                    MATCH (closer)-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]->(e)
                }
                RETURN apoc.coll.toSet(COLLECT(properties(s))) AS source_samples
            """, uuid=uuid).single()
            result["source_samples"] = [dict(s) for s in (source_record["source_samples"] or [])]
        return result
"abcd1234-ef56-gh78-ij90-klmnop123456" '500': - description: Internal Error \ No newline at end of file + description: Internal Error + '/entities/{id}/reindex-info': + get: + summary: Retrieve all data required to reindex a given entity + parameters: + - name: id + in: path + description: The unique identifier of entity. This identifier can be either an HuBMAP ID (e.g. HBM123.ABCD.456) or UUID + required: true + schema: + type: string + responses: + '200': + description: Entity and its related data including ancestors, descendants, immediate relationships, and optional donor and sample data + content: + application/json: + schema: + type: object + examples: + reindexinfoexample: + summary: 'An example of a reindex-info response' + value: + entity: object + ancestors: [entity objects] + descendants: [entity objects] + immediate_ancestors: [entity objects] + immediate_descendants: [entity objects] + donor: object + origin_samples: [entity objects] + source_samples: [entity objects] + '400': + description: Invalid or misformatted entity identifier + '401': + description: The user's token has expired or the user did not supply a valid token + '403': + description: The user is not authorized to access the entity + '404': + description: The target entity could not be found + '500': + description: An unexpected error occurred + '/entities/{id}/dataset-documents': + get: + summary: Retrieve processed dataset documents associated with a collection, epicollection, or upload + parameters: + - name: id + in: path + description: The unique identifier of entity. This identifier can be either an HuBMAP ID (e.g. 
HBM123.ABCD.456) or UUID of a Collection, Epicollection, or Upload + required: true + schema: + type: string + responses: + '200': + description: A JSON object mapping dataset UUIDs to their processed document representations enriched via the trigger pipeline and normalized for response + content: + application/json: + schema: + type: object + additionalProperties: + type: object + examples: + datasetdocumentsexample: + summary: 'An example of a dataset-documents response' + value: + abcd1234-ef56-gh78-ij90-klmnop123456: + entity_type: 'Dataset' + uuid: 'abcd1234-ef56-gh78-ij90-klmnop123456' + hubmap_id: 'HBM123.ABCD.456' + status: 'Published' + data_types: ['CODEX'] + wxyz5678-ab12-cd34-ef56-ghijklmnopqr: + entity_type: 'Dataset' + uuid: 'wxyz5678-ab12-cd34-ef56-ghijklmnopqr' + hubmap_id: 'HBM456.WXYZ.789' + status: 'Published' + data_types: ['MxIF'] + '400': + description: Invalid or misformatted entity identifier + '401': + description: The user's token has expired or the user did not supply a valid token + '403': + description: The user is not authorized to access the entity + '404': + description: The target entity could not be found + '500': + description: An unexpected error occurred \ No newline at end of file diff --git a/src/app.py b/src/app.py index b437f000..95919eed 100644 --- a/src/app.py +++ b/src/app.py @@ -980,7 +980,8 @@ def run_triggers(entity_dict): except Exception as e: logger.error(f"Trigger pipeline failed for {entity_dict.get('uuid')}: {e}") return entity_dict - + # There can be a ton of overlap between the multiple values we're fetching, so we don't want to run the triggers on the same data more + # than once. So a simple cache is useful here. 
# NOTE(review): route parameter restored as <uuid> to match the function
# signature; the angle-bracket markup was garbled in extraction — confirm
# against the deployed route.
@app.route('/entities/<uuid>/dataset-documents', methods=['GET'])
def get_dataset_documents(uuid):
    """
    Retrieve dataset documents associated with a collection, epicollection, or upload.

    Parameters
    ----------
    uuid : str
        The UUID of the target entity (Collection, Epicollection, or Upload)

    Returns
    -------
    flask.Response
        JSON mapping dataset UUIDs to their document representations,
        normalized for response and stripped of selected large or
        unnecessary fields (ingest_metadata, metadata, files). Returns a
        404 error when the entity is not found or is not a supported type.
        Oversized payloads may be stashed by try_stash_response_body and
        answered with its response instead.
    """
    validate_token_if_auth_header_exists(request)

    entity_record = app_neo4j_queries.get_dataset_documents_raw(neo4j_driver_instance, uuid)
    if entity_record is None:
        return not_found_error(f"Entity {uuid} not found")

    result = {}
    for dataset_uuid, entity_dict in entity_record.items():
        try:
            complete = schema_manager.remove_none_values(dict(entity_dict))
            final = schema_manager.normalize_document_result_for_response(entity_dict=complete)
            # Strip fields that are large or unnecessary in the index document
            for field in ['ingest_metadata', 'metadata', 'files']:
                final.pop(field, None)
            result[dataset_uuid] = final
        except Exception as e:
            # One bad dataset should not fail the whole batch
            logger.error(f"Failed to process document for {dataset_uuid}: {e}")
            continue

    # Large responses may be redirected (e.g. to S3) instead of returned
    # inline; try_stash_response_body returns None when the body is small
    # enough to send directly.
    resp_body = json.dumps(result).encode('utf-8')
    try_resp = try_stash_response_body(resp_body)
    if try_resp is not None:
        return try_resp
    return jsonify(result)
def get_dataset_documents_raw(neo4j_driver, uuid):
    """
    Retrieve dataset node properties associated with a collection or upload.

    Parameters
    ----------
    neo4j_driver : neo4j.Driver object
        The Neo4j database connection pool
    uuid : str
        The UUID of the target entity (Collection, Epicollection, or Upload)

    Returns
    -------
    dict or None
        A dict mapping dataset UUIDs to their node properties for all
        datasets directly linked to the entity via IN_COLLECTION or
        IN_UPLOAD. An empty dict when no datasets are linked; None when
        the uuid does not exist or is not a supported entity type.
    """
    with neo4j_driver.session() as session:
        entity_record = session.run("""
            MATCH (e:Entity {uuid: $uuid})
            RETURN e.entity_type AS entity_type
        """, uuid=uuid).single()
        if not entity_record:
            return None
        entity_type = entity_record["entity_type"]

        # Map the supported root entity types to the label/relationship
        # used to reach their member datasets.
        if entity_type in ['Collection', 'Epicollection']:
            relationship = 'IN_COLLECTION'
            root_label = 'Collection'
        elif entity_type == 'Upload':
            relationship = 'IN_UPLOAD'
            root_label = 'Upload'
        else:
            return None

        # Label and relationship are interpolated from the fixed mapping
        # above, never from user input; the uuid stays parameterized.
        record = session.run(f"""
            MATCH (root:{root_label} {{uuid: $uuid}})<-[:{relationship}]-(d:Dataset)
            RETURN apoc.map.fromPairs(COLLECT([d.uuid, properties(d)])) AS result
        """, uuid=uuid).single()

        if not record or not record["result"]:
            return {}

        # Use a distinct name for the per-dataset uuid so the `uuid`
        # parameter is not shadowed.
        return {ds_uuid: dict(props) for ds_uuid, props in record["result"].items()}
95919eed..a50890e9 100644 --- a/src/app.py +++ b/src/app.py @@ -1037,16 +1037,7 @@ def get_dataset_documents(uuid): result = {} for dataset_uuid, entity_dict in entity_record.items(): try: - generated = schema_manager.generate_triggered_data( - trigger_type=TriggerTypeEnum.ON_INDEX, - normalized_class=entity_dict['entity_type'], - request_args=request.args, - user_token=token, - existing_data_dict=entity_dict, - new_data_dict={}, - properties_to_skip=["title"] - ) - complete = schema_manager.remove_none_values({**entity_dict, **generated}) + complete = schema_manager.remove_none_values({**entity_dict}) final = schema_manager.normalize_document_result_for_response(entity_dict=complete) for field in ['ingest_metadata', 'metadata', 'files']: final.pop(field, None) From cf72d5e6423178d275aa93db3d74eac0fa613380 Mon Sep 17 00:00:00 2001 From: DerekFurstPitt Date: Tue, 21 Apr 2026 00:37:08 -0400 Subject: [PATCH 4/5] wip: added donors --- src/app.py | 2 ++ src/app_neo4j_queries.py | 13 +++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/app.py b/src/app.py index a50890e9..c699c760 100644 --- a/src/app.py +++ b/src/app.py @@ -999,6 +999,8 @@ def run_triggers_cached(entity_dict): } if raw.get("donor"): result["donor"] = run_triggers_cached(raw["donor"]) + if raw.get("donors"): + result["donors"] = [run_triggers_cached(e) for e in raw["donors"]] if raw.get("origin_samples"): result["origin_samples"] = [run_triggers_cached(e) for e in raw["origin_samples"]] if raw.get("source_samples"): diff --git a/src/app_neo4j_queries.py b/src/app_neo4j_queries.py index c56810b9..9a066363 100644 --- a/src/app_neo4j_queries.py +++ b/src/app_neo4j_queries.py @@ -225,13 +225,14 @@ def get_reindex_info_raw(neo4j_driver, uuid): } if entity_type.lower() in ['sample', 'dataset', 'publication']: - donor_record = session.run(""" - MATCH (e:Entity {uuid: $uuid}) - MATCH (e)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Entity {entity_type: 'Donor'}) - RETURN properties(d) AS 
donor - LIMIT 1 + donors_record = session.run(""" + MATCH (e:Entity {uuid: $uuid})<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Donor) + RETURN apoc.coll.toSet(COLLECT(properties(d))) AS donors """, uuid=uuid).single() - donor = dict(donor_record["donor"]) if donor_record and donor_record["donor"] else None + donors = [dict(d) for d in (donors_record["donors"] or [])] + donor = donors[0] + if donors is not None: + result['donors'] = donors if donor is not None: result['donor'] = donor From 3d02671c29c9e78b96539b204ae2916464aa69e5 Mon Sep 17 00:00:00 2001 From: DerekFurstPitt Date: Tue, 21 Apr 2026 01:25:08 -0400 Subject: [PATCH 5/5] added support for donors in the reindex-info --- src/app_neo4j_queries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/app_neo4j_queries.py b/src/app_neo4j_queries.py index 9a066363..b1ba2012 100644 --- a/src/app_neo4j_queries.py +++ b/src/app_neo4j_queries.py @@ -230,7 +230,7 @@ def get_reindex_info_raw(neo4j_driver, uuid): RETURN apoc.coll.toSet(COLLECT(properties(d))) AS donors """, uuid=uuid).single() donors = [dict(d) for d in (donors_record["donors"] or [])] - donor = donors[0] + donor = donors[0] if donors else None if donors is not None: result['donors'] = donors if donor is not None: