From b785b3ae65c83f0edca4434f8f5c55b56fd48e9c Mon Sep 17 00:00:00 2001 From: ywilke Date: Tue, 25 Jun 2024 12:23:54 +0000 Subject: [PATCH] Improve performance of pdb to parquet conversion. --- .../base/atom_types/atom_types.py | 20 ++++++------------- .../base/atom_types/atom_types.py | 9 +++++---- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/src/binder_classification/base/atom_types/atom_types.py b/src/binder_classification/base/atom_types/atom_types.py index 8ae5290..675ba16 100644 --- a/src/binder_classification/base/atom_types/atom_types.py +++ b/src/binder_classification/base/atom_types/atom_types.py @@ -708,25 +708,17 @@ def get_types( if return_occupancy_value: pdb_df = PandasPdb().read_pdb(str(inf)).df["ATOM"] pdb_df = pdb_df.set_index(["x_coord", "y_coord", "z_coord"]) + occupancy_lookup = pdb_df["occupancy"].astype(int).to_dict() for i in range(len(df)): - row = df.iloc[i] - - #try: - # occupancy = int(pdb_df.loc[row["x"], row["y"], row["z"]]["occupancy"]) - #except: - # print('occ error with', inf) - # print(pdb_df.loc[row["x"], row["y"], row["z"]]["occupancy"]) - # pass - - occupancy = int( - pdb_df.loc[row["x"], row["y"], row["z"]]["occupancy"] - ) + x = xs[i] + y = ys[i] + z = zs[i] + + occupancy = occupancy_lookup[(x, y, z)] occupancy_values.append(occupancy) return types, occupancy_values - - def run( self, inf: str, diff --git a/src/ddg_regression/base/atom_types/atom_types.py b/src/ddg_regression/base/atom_types/atom_types.py index af888b5..0791366 100644 --- a/src/ddg_regression/base/atom_types/atom_types.py +++ b/src/ddg_regression/base/atom_types/atom_types.py @@ -708,12 +708,13 @@ def get_types( if return_occupancy_value: pdb_df = PandasPdb().read_pdb(str(inf)).df["ATOM"] pdb_df = pdb_df.set_index(["x_coord", "y_coord", "z_coord"]) + occupancy_lookup = pdb_df["occupancy"].astype(int).to_dict() for i in range(len(df)): - row = df.iloc[i] + x = xs[i] + y = ys[i] + z = zs[i] - occupancy = int( - pdb_df.loc[row["x"], row["y"], row["z"]]["occupancy"] - ) + occupancy = occupancy_lookup[(x, y, z)] occupancy_values.append(occupancy) return types, occupancy_values