From 3afced09cd1729d3d94935339740c1e971ff2311 Mon Sep 17 00:00:00 2001 From: kburke <209327+kburke@users.noreply.github.com> Date: Mon, 15 Dec 2025 08:44:31 -0500 Subject: [PATCH 1/6] Switch loops over Neo4j calls to single call for entity-api#974 --- src/schema/schema_constants.py | 10 +++++ src/schema/schema_neo4j_queries.py | 66 ++++++++++++++++++++++++++---- src/schema/schema_validators.py | 62 ++++++++++++---------------- 3 files changed, 94 insertions(+), 44 deletions(-) diff --git a/src/schema/schema_constants.py b/src/schema/schema_constants.py index 957b5d06..7bbc7c14 100644 --- a/src/schema/schema_constants.py +++ b/src/schema/schema_constants.py @@ -54,3 +54,13 @@ class TriggerTypeEnum(Enum): BEFORE_UPDATE = 'before_update_trigger' AFTER_CREATE = 'after_create_trigger' AFTER_UPDATE = 'after_update_trigger' + +# Define an enumeration of accepted Neo4j relationship types. +class Neo4jRelationshipEnum(Enum): + ACTIVITY_INPUT = 'ACTIVITY_INPUT' + ACTIVITY_OUTPUT = 'ACTIVITY_INPUT' + IN_COLLECTION = 'IN_COLLECTION' + N_UPLOAD = 'N_UPLOAD' + REVISION_OF = 'REVISION_OF' + USES_DATA = 'USES_DATA' + diff --git a/src/schema/schema_neo4j_queries.py b/src/schema/schema_neo4j_queries.py index e3463947..d30e19d9 100644 --- a/src/schema/schema_neo4j_queries.py +++ b/src/schema/schema_neo4j_queries.py @@ -1,5 +1,7 @@ +import neo4j from neo4j.exceptions import TransactionError -from schema.schema_constants import SchemaConstants +from neo4j import Session as Neo4jSession +from schema.schema_constants import SchemaConstants, Neo4jRelationshipEnum import logging logger = logging.getLogger(__name__) @@ -7,7 +9,6 @@ # The filed name of the single result record record_field_name = 'result' - #################################################################################################### ## Functions can be called by app.py, schema_manager.py, and schema_triggers.py #################################################################################################### @@ -109,7 +110,37 @@ def get_entity(neo4j_driver, uuid): return result +""" +Given a list of UUIDs, return a dict mapping uuid -> entity_node +Only UUIDs present in Neo4j will be returned. + +Parameters +---------- +neo4j_driver : neo4j.Driver object + The neo4j database connection pool +uuid_list : list of str + The uuids of target entities to retrieve from Neo4j + +Returns +------- +dict + A dictionary of entity details returned from the Cypher query, keyed by + the uuid provided in uuid_list. +""" +def get_entities(neo4j_driver, uuid_list): + + if not uuid_list: + return {} + + query = """ + MATCH (e:Entity) + WHERE e.uuid IN $param_uuids + RETURN e.uuid AS uuid, e AS entity + """ + with neo4j_driver.session() as session: + results = session.run(query, param_uuids=uuid_list) + return {record["uuid"]: record["entity"] for record in results} """ Get the uuids for each entity in a list that doesn't belong to a certain entity type. Uuids are ordered by type @@ -889,13 +920,11 @@ def link_collection_to_datasets(neo4j_driver, collection_uuid, dataset_uuid_list _delete_collection_linkages_tx(tx=tx , uuid=collection_uuid) - # Create relationship from each member Dataset node to this Collection node - for dataset_uuid in dataset_uuid_list: - create_relationship_tx(tx=tx - , source_node_uuid=dataset_uuid - , direction='->' - , target_node_uuid=collection_uuid - , relationship='IN_COLLECTION') + _create_relationships_unwind_tx(tx=tx + , source_uuid_list=dataset_uuid_list + , target_uuid=collection_uuid + , relationship=Neo4jRelationshipEnum.IN_COLLECTION + , direction='->') tx.commit() except TransactionError as te: @@ -1980,6 +2009,25 @@ def create_relationship_tx(tx, source_node_uuid, target_node_uuid, relationship, result = tx.run(query) +def _create_relationships_unwind_tx(tx:Neo4jSession, source_uuid_list:list, target_uuid:str + , relationship:Neo4jRelationshipEnum, direction:str)->None: + logger.info("====== enter _create_relationships_unwind_tx() ======") + incoming = direction if direction == "<-" else "-" + outgoing = direction if direction == "->" else "-" + + query = ( + f"MATCH (t {{uuid: $target_uuid}}) " + f"UNWIND $source_uuid_list AS src_uuid " + f"MATCH (s {{uuid: src_uuid}}) " + f"CREATE (s){incoming}[r:{relationship.value}]{outgoing}(t) " + f"RETURN src_uuid AS linked_uuid" + ) + + result = tx.run( query=query + , target_uuid=target_uuid + , source_uuid_list=source_uuid_list) + logger.info("====== returning from _create_relationships_unwind_tx() ======") + """ Execute one query to create all outgoing relationships from each node whose identifier is in the source node list to the target Activity node in neo4j diff --git a/src/schema/schema_validators.py b/src/schema/schema_validators.py index 078f319a..bc9eef19 100644 --- a/src/schema/schema_validators.py +++ b/src/schema/schema_validators.py @@ -298,7 +298,8 @@ def verify_DOI_pair(property_key, normalized_entity_type, request, existing_data f" the prefix {SchemaConstants.DOI_BASE_URL}.") """ -Validate every entity in a list is of entity_type accepted +Validate every entity in a list is of entity_type that can be in a +Collection and already exists in Neo4j. Parameters ---------- @@ -318,41 +319,32 @@ def collection_entities_are_existing_datasets(property_key, normalized_entity_ty # Verify each UUID specified exists in the uuid-api, exists in Neo4j, and is for a Dataset before # proceeding with creation of Collection. bad_dataset_uuids = [] - for dataset_uuid in new_data_dict['dataset_uuids']: - try: - ## The following code duplicates some functionality existing in app.py, in - ## query_target_entity(), which also deals with caching. In the future, the - ## validation logic shared by this file and app.py should become a utility - ## module, shared by validators as well as app.py. But for now, the code - ## is repeated for the following. - - # Get cached ids if exist otherwise retrieve from UUID-API. Expect an - # Exception to be raised if not found. - dataset_uuid_entity = schema_manager.get_hubmap_ids(id=dataset_uuid) - - # If the uuid exists per the uuid-api, make sure it also exists as a Neo4j entity. - uuid = dataset_uuid_entity['uuid'] - entity_dict = schema_neo4j_queries.get_entity(schema_manager.get_neo4j_driver_instance(), dataset_uuid) - - # If dataset_uuid is not found in Neo4j or is not for a Dataset, fail the validation. - if not entity_dict: - logger.info(f"Request for {dataset_uuid} inclusion in Collection," - f" but not found in Neo4j.") - bad_dataset_uuids.append(dataset_uuid) - elif entity_dict['entity_type'] != 'Dataset': - logger.info(f"Request for {dataset_uuid} inclusion in Collection," - f" but entity_type={entity_dict['entity_type']}, not Dataset.") - bad_dataset_uuids.append(dataset_uuid) - except Exception as nfe: - # If the dataset_uuid is not found, fail the validation. - logger.info(f"Request for {dataset_uuid} inclusion in Collection" - f" failed uuid-api retrieval.") - bad_dataset_uuids.append(dataset_uuid) + dataset_uuid_list = new_data_dict['dataset_uuids'] + if not dataset_uuid_list: + return + + existing_uuid_entities = schema_neo4j_queries.get_entities( neo4j_driver=schema_manager.get_neo4j_driver_instance() + , uuid_list=dataset_uuid_list) + + # If any UUIDs which were passed in do not exist in Neo4j or are not Datasets, identify them + missing_uuid_set = set(dataset_uuid_list) - set(existing_uuid_entities) + if missing_uuid_set: + logger.info(f"Request for inclusion in Collection but not found in Neo4j:" + f" {sorted(missing_uuid_set)}") + + non_dataset_uuid_set = set() + for uuid, neo4j_entity in existing_uuid_entities.items(): + if neo4j_entity['entity_type'] != 'Dataset': + non_dataset_uuid_set.add(uuid) + if non_dataset_uuid_set: + logger.info(f"Request for inclusion in Collection, but non-Dataset entities in Neo4j:" + f" {sorted(non_dataset_uuid_set)}") + # If any uuids in the request dataset_uuids are not for an existing Dataset entity which - # exists in uuid-api and Neo4j, raise an Exception so the validation fails and the - # operation can be rejected. - if bad_dataset_uuids: - raise ValueError(f"Unable to find Datasets for {bad_dataset_uuids}.") + # exists in Neo4j, raise an Exception so the validation fails and the operation can be rejected. + if missing_uuid_set or non_dataset_uuid_set: + raise ValueError(f"Unable to find Datasets for" + f" {sorted(missing_uuid_set.union(non_dataset_uuid_set))}") """ Validate the provided value of Dataset.status on update via PUT From 1cff0da3b7644c99ac9451d0349a95f00b7f93b1 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Mon, 15 Dec 2025 13:58:54 -0500 Subject: [PATCH 2/6] Bump version from 2.6.9 to 2.6.10 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index d48d3702..a04abec9 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.6.9 +2.6.10 From 57dbb4476b0afd5725aeb994f84a4f86b072e9d5 Mon Sep 17 00:00:00 2001 From: kburke <209327+kburke@users.noreply.github.com> Date: Mon, 15 Dec 2025 14:56:06 -0500 Subject: [PATCH 3/6] Correct enum constant identified by code review --- src/schema/schema_constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/schema/schema_constants.py b/src/schema/schema_constants.py index 7bbc7c14..b8d6efda 100644 --- a/src/schema/schema_constants.py +++ b/src/schema/schema_constants.py @@ -60,7 +60,7 @@ class Neo4jRelationshipEnum(Enum): ACTIVITY_INPUT = 'ACTIVITY_INPUT' ACTIVITY_OUTPUT = 'ACTIVITY_INPUT' IN_COLLECTION = 'IN_COLLECTION' - N_UPLOAD = 'N_UPLOAD' + IN_UPLOAD = 'IN_UPLOAD' REVISION_OF = 'REVISION_OF' USES_DATA = 'USES_DATA' From 8eaa9dee185bfee8c2d1a832e6aa87625c230207 Mon Sep 17 00:00:00 2001 From: kburke <209327+kburke@users.noreply.github.com> Date: Mon, 15 Dec 2025 15:10:00 -0500 Subject: [PATCH 4/6] Add documentation for new _create_relationships_unwind_tx() method, requested during code review. --- src/schema/schema_neo4j_queries.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/schema/schema_neo4j_queries.py b/src/schema/schema_neo4j_queries.py index d30e19d9..804f9b45 100644 --- a/src/schema/schema_neo4j_queries.py +++ b/src/schema/schema_neo4j_queries.py @@ -2009,6 +2009,24 @@ def create_relationship_tx(tx, source_node_uuid, target_node_uuid, relationship, result = tx.run(query) +""" +Create multiple relationships between a target node and each node in +a list of source nodes in neo4j + +Parameters +---------- +tx : neo4j.Session object + The neo4j.Session object instance +source_uuid_list : list[str] + A list of UUIDs for nodes which will have a relationship to the node with target_uuid +target_uuid : str + The UUID of target node +relationship : Neo4jRelationshipEnum + The string for the Neo4j relationship type between each source node and the target node. +direction: str + The relationship direction of each source node to the target node: outgoing `->` or incoming `<-` + Neo4j CQL CREATE command supports only directional relationships +""" def _create_relationships_unwind_tx(tx:Neo4jSession, source_uuid_list:list, target_uuid:str , relationship:Neo4jRelationshipEnum, direction:str)->None: logger.info("====== enter _create_relationships_unwind_tx() ======") From d4aed5ebe87710b23815f248512be9de0a374735 Mon Sep 17 00:00:00 2001 From: kburke <209327+kburke@users.noreply.github.com> Date: Mon, 15 Dec 2025 16:17:38 -0500 Subject: [PATCH 5/6] Eliminate looping results sets and providing specific error msgs --- src/schema/schema_neo4j_queries.py | 11 ++++++----- src/schema/schema_validators.py | 24 ++++++------------------ 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/src/schema/schema_neo4j_queries.py b/src/schema/schema_neo4j_queries.py index 804f9b45..6ab089b8 100644 --- a/src/schema/schema_neo4j_queries.py +++ b/src/schema/schema_neo4j_queries.py @@ -127,20 +127,21 @@ def get_entity(neo4j_driver, uuid): A dictionary of entity details returned from the Cypher query, keyed by the uuid provided in uuid_list. """ -def get_entities(neo4j_driver, uuid_list): +def get_existing_dataset_entities(neo4j_driver, dataset_uuid_list:list): - if not uuid_list: + if not dataset_uuid_list: return {} query = """ MATCH (e:Entity) WHERE e.uuid IN $param_uuids - RETURN e.uuid AS uuid, e AS entity + AND e.entity_type='Dataset' + RETURN e.uuid AS uuid """ with neo4j_driver.session() as session: - results = session.run(query, param_uuids=uuid_list) - return {record["uuid"]: record["entity"] for record in results} + results = session.run(query, param_uuids=dataset_uuid_list) + return [record["uuid"] for record in results] """ Get the uuids for each entity in a list that doesn't belong to a certain entity type. Uuids are ordered by type diff --git a/src/schema/schema_validators.py b/src/schema/schema_validators.py index bc9eef19..1cd1e735 100644 --- a/src/schema/schema_validators.py +++ b/src/schema/schema_validators.py @@ -323,28 +323,16 @@ def collection_entities_are_existing_datasets(property_key, normalized_entity_ty if not dataset_uuid_list: return - existing_uuid_entities = schema_neo4j_queries.get_entities( neo4j_driver=schema_manager.get_neo4j_driver_instance() - , uuid_list=dataset_uuid_list) + existing_datasets_list = schema_neo4j_queries.get_existing_dataset_entities( neo4j_driver=schema_manager.get_neo4j_driver_instance() + , dataset_uuid_list=dataset_uuid_list) # If any UUIDs which were passed in do not exist in Neo4j or are not Datasets, identify them - missing_uuid_set = set(dataset_uuid_list) - set(existing_uuid_entities) + missing_uuid_set = set(dataset_uuid_list) - set(existing_datasets_list) if missing_uuid_set: - logger.info(f"Request for inclusion in Collection but not found in Neo4j:" + logger.info(f"Only existing Datasets may be included in a Collection:" f" {sorted(missing_uuid_set)}") - - non_dataset_uuid_set = set() - for uuid, neo4j_entity in existing_uuid_entities.items(): - if neo4j_entity['entity_type'] != 'Dataset': - non_dataset_uuid_set.add(uuid) - if non_dataset_uuid_set: - logger.info(f"Request for inclusion in Collection, but non-Dataset entities in Neo4j:" - f" {sorted(non_dataset_uuid_set)}") - - # If any uuids in the request dataset_uuids are not for an existing Dataset entity which - # exists in Neo4j, raise an Exception so the validation fails and the operation can be rejected. - if missing_uuid_set or non_dataset_uuid_set: - raise ValueError(f"Unable to find Datasets for" - f" {sorted(missing_uuid_set.union(non_dataset_uuid_set))}") + raise ValueError( f"Only existing Datasets may be included in a Collection, not these: " + f" {sorted(missing_uuid_set)}") """ Validate the provided value of Dataset.status on update via PUT From ab8b63a590d1304bf3b2639d370b79b627a0ea81 Mon Sep 17 00:00:00 2001 From: kburke <209327+kburke@users.noreply.github.com> Date: Mon, 15 Dec 2025 16:29:07 -0500 Subject: [PATCH 6/6] Rename function to align with limited function --- src/schema/schema_neo4j_queries.py | 2 +- src/schema/schema_validators.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/schema/schema_neo4j_queries.py b/src/schema/schema_neo4j_queries.py index 6ab089b8..a663e04d 100644 --- a/src/schema/schema_neo4j_queries.py +++ b/src/schema/schema_neo4j_queries.py @@ -127,7 +127,7 @@ def get_entity(neo4j_driver, uuid): A dictionary of entity details returned from the Cypher query, keyed by the uuid provided in uuid_list. """ -def get_existing_dataset_entities(neo4j_driver, dataset_uuid_list:list): +def identify_existing_dataset_entities(neo4j_driver, dataset_uuid_list:list): if not dataset_uuid_list: return {} diff --git a/src/schema/schema_validators.py b/src/schema/schema_validators.py index 1cd1e735..0ca4acb5 100644 --- a/src/schema/schema_validators.py +++ b/src/schema/schema_validators.py @@ -323,8 +323,8 @@ def collection_entities_are_existing_datasets(property_key, normalized_entity_ty if not dataset_uuid_list: return - existing_datasets_list = schema_neo4j_queries.get_existing_dataset_entities( neo4j_driver=schema_manager.get_neo4j_driver_instance() - , dataset_uuid_list=dataset_uuid_list) + existing_datasets_list = schema_neo4j_queries.identify_existing_dataset_entities( neo4j_driver=schema_manager.get_neo4j_driver_instance() + , dataset_uuid_list=dataset_uuid_list) # If any UUIDs which were passed in do not exist in Neo4j or are not Datasets, identify them missing_uuid_set = set(dataset_uuid_list) - set(existing_datasets_list)