Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 85 additions & 1 deletion entity-api-spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3182,4 +3182,88 @@ paths:
hubmap_id: "HBM987.WXYZ.321"
uuid: "abcd1234-ef56-gh78-ij90-klmnop123456"
'500':
description: Internal Error
description: Internal Error
'/entities/{id}/reindex-info':
get:
summary: Retrieve all data required to reindex a given entity
parameters:
- name: id
in: path
description: The unique identifier of the entity. This identifier can be either a HuBMAP ID (e.g. HBM123.ABCD.456) or a UUID
required: true
schema:
type: string
responses:
'200':
description: Entity and its related data including ancestors, descendants, immediate relationships, and optional donor and sample data
content:
application/json:
schema:
type: object
examples:
reindexinfoexample:
summary: 'An example of a reindex-info response'
value:
entity: object
ancestors: [entity objects]
descendants: [entity objects]
immediate_ancestors: [entity objects]
immediate_descendants: [entity objects]
donor: object
origin_samples: [entity objects]
source_samples: [entity objects]
'400':
description: Invalid or misformatted entity identifier
'401':
description: The user's token has expired or the user did not supply a valid token
'403':
description: The user is not authorized to access the entity
'404':
description: The target entity could not be found
'500':
description: An unexpected error occurred
'/entities/{id}/dataset-documents':
get:
summary: Retrieve processed dataset documents associated with a collection, epicollection, or upload
parameters:
- name: id
in: path
description: The unique identifier of the entity. This identifier can be either a HuBMAP ID (e.g. HBM123.ABCD.456) or a UUID of a Collection, Epicollection, or Upload
required: true
schema:
type: string
responses:
'200':
description: A JSON object mapping dataset UUIDs to their processed document representations enriched via the trigger pipeline and normalized for response
content:
application/json:
schema:
type: object
additionalProperties:
type: object
examples:
datasetdocumentsexample:
summary: 'An example of a dataset-documents response'
value:
abcd1234-ef56-gh78-ij90-klmnop123456:
entity_type: 'Dataset'
uuid: 'abcd1234-ef56-gh78-ij90-klmnop123456'
hubmap_id: 'HBM123.ABCD.456'
status: 'Published'
data_types: ['CODEX']
wxyz5678-ab12-cd34-ef56-ghijklmnopqr:
entity_type: 'Dataset'
uuid: 'wxyz5678-ab12-cd34-ef56-ghijklmnopqr'
hubmap_id: 'HBM456.WXYZ.789'
status: 'Published'
data_types: ['MxIF']
'400':
description: Invalid or misformatted entity identifier
'401':
description: The user's token has expired or the user did not supply a valid token
'403':
description: The user is not authorized to access the entity
'404':
description: The target entity could not be found
'500':
description: An unexpected error occurred
112 changes: 112 additions & 0 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -941,6 +941,118 @@ def get_document_by_id(id):
result_dict = _get_metadata_by_id(entity_id=id, metadata_scope=MetadataScopeEnum.INDEX)
return jsonify(result_dict)

"""
Retrieve all data required to reindex a given entity

Parameters
----------
uuid : str
The HuBMAP ID or UUID of the target entity

Returns
-------
json
Entity and its related data (ancestors,
descendants, immediate relationships, and optional donor and sample data)
"""
@app.route('/entities/<uuid>/reindex-info', methods=['GET'])
def get_reindex_info(uuid):
validate_token_if_auth_header_exists(request)
token = get_internal_token()

raw = app_neo4j_queries.get_reindex_info_raw(neo4j_driver_instance, uuid)
if raw is None:
return not_found_error(f"Entity {uuid} not found")

def run_triggers(entity_dict):
try:
generated = schema_manager.generate_triggered_data(
trigger_type=TriggerTypeEnum.ON_INDEX,
normalized_class=entity_dict['entity_type'],
request_args=request.args,
user_token=token,
existing_data_dict=entity_dict,
new_data_dict={},
properties_to_skip=[]
)
complete = schema_manager.remove_none_values({**entity_dict, **generated})
return schema_manager.normalize_document_result_for_response(entity_dict=complete)
except Exception as e:
logger.error(f"Trigger pipeline failed for {entity_dict.get('uuid')}: {e}")
return entity_dict
# There can be a ton of overlap between the multiple values we're fetching, so we don't want to run the triggers on the same data more
# than once. So a simple cache is useful here.
triggered_cache = {}

def run_triggers_cached(entity_dict):
uid = entity_dict.get('uuid')
if uid not in triggered_cache:
triggered_cache[uid] = run_triggers(entity_dict)
return triggered_cache[uid]

result = {
"entity": run_triggers(raw["entity"]),
"ancestors": [run_triggers_cached(e) for e in raw["ancestors"]],
"descendants": [run_triggers_cached(e) for e in raw["descendants"]],
"immediate_ancestors": [run_triggers_cached(e) for e in raw["immediate_ancestors"]],
"immediate_descendants": [run_triggers_cached(e) for e in raw["immediate_descendants"]],
}
if raw.get("donor"):
result["donor"] = run_triggers_cached(raw["donor"])
if raw.get("donors"):
result["donors"] = [run_triggers_cached(e) for e in raw["donors"]]
if raw.get("origin_samples"):
result["origin_samples"] = [run_triggers_cached(e) for e in raw["origin_samples"]]
if raw.get("source_samples"):
result["source_samples"] = [run_triggers_cached(e) for e in raw["source_samples"]]
resp_body = json.dumps(result).encode('utf-8')
try_resp = try_stash_response_body(resp_body)
if try_resp is not None:
return try_resp
return jsonify(result)

"""
Retrieve processed dataset documents associated with a collection or upload

Parameters
----------
uuid : str
The UUID of the target entity (Collection, Epicollection, or Upload)

Returns
-------
json
A JSON object mapping dataset UUIDs to their processed document representations.
Each dataset is enriched via the trigger pipeline (ON_INDEX), normalized for response,
and stripped of selected large or unnecessary fields (e.g., ingest_metadata, metadata, files).
Returns a 404 error if the entity is not found.
"""
@app.route('/entities/<uuid>/dataset-documents', methods=['GET'])
def get_dataset_documents(uuid):
validate_token_if_auth_header_exists(request)
token = get_internal_token()

entity_record = app_neo4j_queries.get_dataset_documents_raw(neo4j_driver_instance, uuid)
if entity_record is None:
return not_found_error(f"Entity {uuid} not found")

result = {}
for dataset_uuid, entity_dict in entity_record.items():
try:
complete = schema_manager.remove_none_values({**entity_dict})
final = schema_manager.normalize_document_result_for_response(entity_dict=complete)
for field in ['ingest_metadata', 'metadata', 'files']:
final.pop(field, None)
result[dataset_uuid] = final
except Exception as e:
logger.error(f"Failed to process document for {dataset_uuid}: {e}")
continue

resp_body = json.dumps(result).encode('utf-8')
try_resp = try_stash_response_body(resp_body)
if try_resp is not None:
return try_resp
return jsonify(result)

"""
Retrieve the full tree above the referenced entity and build the provenance document
Expand Down
158 changes: 158 additions & 0 deletions src/app_neo4j_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,165 @@ def get_ancestor_organs(neo4j_driver, entity_uuid):

return results

"""
Retrieve raw Neo4j node properties required for reindexing

Parameters
----------
neo4j_driver : neo4j.Driver object
The Neo4j database connection pool
entity_uuid : str
The HuBMAP ID or UUID of the target entity

Returns
-------
dict
A dictionary of node properties matching the structure of get_entity(),
including related entities such as ancestors, descendants, donor,
origin_samples, source_samples, and immediate relationships
"""
def get_reindex_info_raw(neo4j_driver, uuid):

with neo4j_driver.session() as session:

# Target entity
entity_record = session.run("""
MATCH (e:Entity {uuid: $uuid})
RETURN properties(e) AS entity
""", uuid=uuid).single()
if not entity_record:
return None
entity = dict(entity_record["entity"])
entity_type = entity["entity_type"]

ancestors_record = session.run("""
MATCH (e:Entity {uuid: $uuid})
OPTIONAL MATCH (e)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(a:Entity)
WHERE a.entity_type <> 'Lab'
RETURN apoc.coll.toSet(COLLECT(properties(a))) AS ancestors
""", uuid=uuid).single()
ancestors = [dict(a) for a in (ancestors_record["ancestors"] or [])]

descendants_record = session.run("""
MATCH (e:Entity {uuid: $uuid})
OPTIONAL MATCH (e)-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]->(d:Entity)
RETURN apoc.coll.toSet(COLLECT(properties(d))) AS descendants
""", uuid=uuid).single()
descendants = [dict(d) for d in (descendants_record["descendants"] or [])]

immediate_record = session.run("""
MATCH (e:Entity {uuid: $uuid})
OPTIONAL MATCH (e)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(parent:Entity)
WHERE parent.entity_type <> 'Lab'
WITH e, apoc.coll.toSet(COLLECT(properties(parent))) AS immediate_ancestors
OPTIONAL MATCH (e)-[:ACTIVITY_INPUT]->(:Activity)-[:ACTIVITY_OUTPUT]->(child:Entity)
RETURN immediate_ancestors,
apoc.coll.toSet(COLLECT(properties(child))) AS immediate_descendants
""", uuid=uuid).single()
immediate_ancestors = [dict(p) for p in (immediate_record["immediate_ancestors"] or [])]
immediate_descendants = [dict(c) for c in (immediate_record["immediate_descendants"] or [])]

result = {
"entity": entity,
"ancestors": ancestors,
"descendants": descendants,
"immediate_ancestors": immediate_ancestors,
"immediate_descendants": immediate_descendants,
}

if entity_type.lower() in ['sample', 'dataset', 'publication']:
donors_record = session.run("""
MATCH (e:Entity {uuid: $uuid})<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Donor)
RETURN apoc.coll.toSet(COLLECT(properties(d))) AS donors
""", uuid=uuid).single()
donors = [dict(d) for d in (donors_record["donors"] or [])]
donor = donors[0] if donors else None
if donors is not None:
result['donors'] = donors
if donor is not None:
result['donor'] = donor

origin_samples_record = session.run("""
MATCH (e:Entity {uuid: $uuid})
OPTIONAL MATCH (e)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Entity)
WHERE s.entity_type = 'Sample'
AND s.sample_category IS NOT NULL
AND toLower(s.sample_category) = 'organ'
AND s.organ IS NOT NULL
AND trim(s.organ) <> ''
RETURN apoc.coll.toSet(COLLECT(properties(s))) AS origin_samples
""", uuid=uuid).single()
origin_samples = [dict(s) for s in (origin_samples_record["origin_samples"] or [])]

if (entity_type == 'Sample'
and entity.get('sample_category', '').lower() == 'organ'
and entity.get('organ', '').strip() != ''):
origin_samples = [entity]
if origin_samples is not None:
result['origin_samples'] = origin_samples

if entity_type.lower() in ['dataset', 'publication']:
source_record = session.run("""
MATCH (e:Entity {uuid: $uuid})
MATCH (e)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Entity {entity_type: 'Sample'})
WHERE NOT EXISTS {
MATCH (s)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(closer:Entity {entity_type: 'Sample'})
MATCH (closer)-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]->(e)
}
RETURN apoc.coll.toSet(COLLECT(properties(s))) AS source_samples
""", uuid=uuid).single()
result["source_samples"] = [dict(s) for s in (source_record["source_samples"] or [])]
return result

"""
Retrieve dataset documents associated with a collection or upload

Parameters
----------
neo4j_driver : neo4j.Driver object
The Neo4j database connection pool
uuid : str
The UUID of the target entity (Collection, Epicollection, or Upload)

Returns
-------
dict
A dictionary mapping dataset UUIDs to their node properties for all datasets
directly linked to the given entity via the appropriate relationship
(IN_COLLECTION or IN_UPLOAD). Returns an empty dictionary if no datasets
are found, or None if the input UUID does not correspond to a supported
entity type.
"""
def get_dataset_documents_raw(neo4j_driver, uuid):
with neo4j_driver.session() as session:
entity_record = session.run("""
MATCH (e:Entity {uuid: $uuid})
RETURN e.entity_type AS entity_type
""", uuid=uuid).single()
if not entity_record:
return None
entity_type = entity_record["entity_type"]

if entity_type in ['Collection', 'Epicollection']:
relationship = 'IN_COLLECTION'
root_label = 'Collection'
elif entity_type == 'Upload':
relationship = 'IN_UPLOAD'
root_label = 'Upload'
else:
return None

record = session.run(f"""
MATCH (root:{root_label} {{uuid: $uuid}})<-[:{relationship}]-(d:Dataset)
RETURN apoc.map.fromPairs(COLLECT([d.uuid, properties(d)])) AS result
""", uuid=uuid).single()

if not record or not record["result"]:
return {}

return {uuid: dict(props) for uuid, props in record["result"].items()}


"""
Create multiple sample nodes in neo4j

Expand Down
Loading