Skip to content

Handle HETATM bioassembly cases and related processing updates#13

Open
jhmlam wants to merge 1 commit intoTHGLab:mainfrom
jhmlam:hetatm-bioassembly
Open

Handle HETATM bioassembly cases and related processing updates#13
jhmlam wants to merge 1 commit intoTHGLab:mainfrom
jhmlam:hetatm-bioassembly

Conversation

@jhmlam
Copy link

@jhmlam jhmlam commented Mar 10, 2026

No description provided.

@jhmlam
Copy link
Author

jhmlam commented Mar 10, 2026

@Ericwang6
Some write-ups to explain the changes a little.

  • New functions (1) refine with hetatm, (2) refine with bioassembly (3) refine with hetatm and bioassembly. I made the all the new features optional. E.g. to run the code with option (1) and option (3) activated:
mamba run -n openmm_cuda3 python -m hiqbind.process \
-i /home/jhml/MessLab/Study-HiQBindForInputProcessing/Wiki-ExternalCode/HiQBind/pre_process/hiq_sm.csv \
-d /home/jhml/MessLab/Study-HiQBindForInputProcessing/data/hiqbind_sm/ \
--hetatm_cutoff 4. \
--binding_cutoff 10. \
--do_refine_structure_with_ligand_plus_hetatm_assembly \
--do_refine_structure_with_ligand_plus_assembly \
--serial

# Ligands with elements other than this list will be discarded
COMMON_ELEMENTS = ['H', 'C', 'N', 'O', 'F', 'P', 'S', 'Cl', 'Br', 'I']
#
AMBER14_TIP3PXML_ACCEPTABLE_RESNAME = [
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

include['hetatm'].append(residue)
if np.min(cdist(positions, het_positions)) * 10 > hetatm_cutoff:
continue
if residue.name not in AMBER14_TIP3PXML_ACCEPTABLE_RESNAME:
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for residue in self.topology.residues():
system = forcefield.createSystem(self.topology, nonbondedMethod=app.CutoffNonPeriodic, constraints=None, rigidWater=False)
original_masses = [system.getParticleMass(i) for i in range(system.getNumParticles())]
#omega_atom_quads = self._get_nonpro_peptide_omega_atom_quads()
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remaining issue, I do see some cis-peptide bonds at the pdbfixer added residues. This is also visible in the Figshare data 1ork.

For now I commented out this block, while it cures some of the cis peptide bond, it doesnt cure all; so I reserve this for the next upgrade.

@jhmlam
Copy link
Author

jhmlam commented Mar 10, 2026

It also writes a json which we can inspect how many system contains certain ions and if bioassembly did anything to the PDB model. (most PDB model only have 1 assembly case).

Also attached a plot generated with the following code reading the jsons after running 1000 systems (sorted pdbids, covering pdbid 10** to 1l**), so for these early pdb structures ~80% carries water and 15% have meaningful assembly.

plot_hetatm_bioassembly_indicator_counts

def plot_hetatm_bioassembly_indicator_counts(
    json_glob="/home/jhml/MessLab/Study-HiQBindForInputProcessing/data/hiqbind_sm/*/stat_indicate_hetatm_bioassembly.json",
    figsize=(10, 6),
    sort_descending=True,
):
    import glob
    import json
    import os
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt

    rows = []
    row_names = []

    for json_file in sorted(glob.glob(json_glob)):
        with open(json_file, "r") as f:
            d = json.load(f)
        rows.append(d)
        row_names.append(os.path.basename(os.path.dirname(json_file.rstrip("/"))))

    df = pd.DataFrame(rows, index=row_names).fillna(0)
    df_indicator = (df > 0).astype(int)

    counts = df_indicator.sum(axis=0)
    counts = counts[counts > 0]

    if sort_descending:
        counts = counts.sort_values(ascending=False)

    df_counts = counts.rename_axis("indicator").reset_index(name="count")

    plt.figure(figsize=figsize)
    ax = sns.barplot(
        data=df_counts,
        x="indicator",
        y="count",
        color="blue",
        edgecolor="black",
        errorbar=None,
    )

    ax.set_xlabel("")
    ax.set_ylabel("Count")
    ax.set_yscale("log")
    ax.tick_params(axis="x", rotation=90)
    import math

    ymax = df_counts["count"].max()
    ymax_plot = 5 * (10 ** math.ceil(math.log10(ymax)))
    ax.set_ylim(0.8, ymax_plot)
    for spine in ax.spines.values():
        spine.set_color("black")

    for patch, count in zip(ax.patches, df_counts["count"].tolist()):
        x = patch.get_x() + patch.get_width() / 2.0
        y = patch.get_height()
        ax.text(
            x,
            y * 1.05,
            str(int(count)),
            ha="center",
            va="bottom",
            color="black",
        )

    plt.tight_layout()
    plt.savefig("./plot_hetatm_bioassembly_indicator_counts.svg")
    return df, df_indicator, df_counts, ax
df_raw, df_indicator, df_counts, ax = plot_hetatm_bioassembly_indicator_counts(json_glob="/home/jhml/MessLab/Study-HiQBindForInputProcessing/data/hiqbind_sm/*/stat_indicate_hetatm_bioassembly.json",)

@jhmlam
Copy link
Author

jhmlam commented Mar 10, 2026

It also writes a json which we can inspect how many system contains certain ions and if bioassembly did anything to the PDB model. (most PDB model only have 1 assembly case).

Also attached a plot generated with the following code reading the jsons after running 1000 systems (sorted pdbids, covering pdbid 10** to 1l**), so for these early pdb structures ~80% carries water and 15% have meaningful assembly.

plot_hetatm_bioassembly_indicator_counts

def plot_hetatm_bioassembly_indicator_counts(
    json_glob="/home/jhml/MessLab/Study-HiQBindForInputProcessing/data/hiqbind_sm/*/stat_indicate_hetatm_bioassembly.json",
    figsize=(10, 6),
    sort_descending=True,
):
    import glob
    import json
    import os
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt

    rows = []
    row_names = []

    for json_file in sorted(glob.glob(json_glob)):
        with open(json_file, "r") as f:
            d = json.load(f)
        rows.append(d)
        row_names.append(os.path.basename(os.path.dirname(json_file.rstrip("/"))))

    df = pd.DataFrame(rows, index=row_names).fillna(0)
    df_indicator = (df > 0).astype(int)

    counts = df_indicator.sum(axis=0)
    counts = counts[counts > 0]

    if sort_descending:
        counts = counts.sort_values(ascending=False)

    df_counts = counts.rename_axis("indicator").reset_index(name="count")

    plt.figure(figsize=figsize)
    ax = sns.barplot(
        data=df_counts,
        x="indicator",
        y="count",
        color="blue",
        edgecolor="black",
        errorbar=None,
    )

    ax.set_xlabel("")
    ax.set_ylabel("Count")
    ax.set_yscale("log")
    ax.tick_params(axis="x", rotation=90)
    import math

    ymax = df_counts["count"].max()
    ymax_plot = 5 * (10 ** math.ceil(math.log10(ymax)))
    ax.set_ylim(0.8, ymax_plot)
    for spine in ax.spines.values():
        spine.set_color("black")

    for patch, count in zip(ax.patches, df_counts["count"].tolist()):
        x = patch.get_x() + patch.get_width() / 2.0
        y = patch.get_height()
        ax.text(
            x,
            y * 1.05,
            str(int(count)),
            ha="center",
            va="bottom",
            color="black",
        )

    plt.tight_layout()
    plt.savefig("./plot_hetatm_bioassembly_indicator_counts.svg")
    return df, df_indicator, df_counts, ax
df_raw, df_indicator, df_counts, ax = plot_hetatm_bioassembly_indicator_counts(json_glob="/home/jhml/MessLab/Study-HiQBindForInputProcessing/data/hiqbind_sm/*/stat_indicate_hetatm_bioassembly.json",)

Note that occurrence of these water, ions and bio-assembly may be less frequent in later pdbids as the early pdbids are mostly crystals though

@KSUN63 KSUN63 requested a review from Ericwang6 March 13, 2026 14:57
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant