Source code for npfc.fragment_search

"""
Module fragment_search
==========================
This modules contains the function to run substructure searches.
"""


# standard
import logging
# data handling
from pandas import DataFrame
# chemoinformatics
from rdkit.Chem import rdTautomerQuery


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #


[docs]def get_fragment_hits(df_mols: DataFrame,
                      df_frags: DataFrame,
                      col_mol_mols: str = 'mol',
                      col_mol_frags: str = 'mol',
                      col_mol_inchikey: str = 'inchikey',
                      fcp_labels: str = None,
                      tautomer: bool = False,
                      col_to_index_mols: str = 'idm',
                      col_to_index_frags: str = 'idm',
                      ) -> DataFrame:
    """Create a DataFrame recording every Fragment Hit in the
    input molecule DataFrame.

    A Fragment Hit is composed of 6 fields:

    1) idm: the id of the molecule (rowid from df_mols)
    2) idf: the id of the fragment (rowid from df_frags)
    3) aidxf: the atom indices of the fragment found in the molecule
    4) mol_perc: the percentage of the molecule the fragment represents (based on hac)
    5) mol: the molecule as RDKit Mol object
    6) mol_frag: the fragment as RDKit Mol object

    :param df_mols: the input DataFrame with the molecules (df_mols)
    :param df_frags: the input DataFrame with fragments to use for substructure search (df_frags)
    :param col_mol_mols: the column name in df_mols with the molecules
    :param col_mol_frags: the column name in df_frags with the fragments
    :param col_mol_inchikey: the input DataFrame column name with the inchikey of the molecule. If it does not exist, then an empty column is created in the output.
    :param fcp_labels: the column name in the fragments dataframe with the fcp labels
    :param tautomer: if set to True, tautomers will be taken into account during fragment search (warning, tautomer-independant search is much slower!)
    :param col_to_index_mols: set the row indices of the DataFrame with the molecules to probe to the specified column. If empty (''), indices are left untouched.
    :param col_to_index_frags: set the row indices of the DataFrame with the fragments to seach for to the specified column. If empty (''), indices are left untouched.

    :return: the substructure matches as a DataFrame

    .. warning:: Row indices are used for recording the ids of substructure hits and are therefore required to be set to the molecule identifiers (i.e. idm).

    """
    # init
    d = {}
    d['idm'] = []
    d['idf'] = []
    d['_aidxf'] = []
    d['mol_perc'] = []  # proportion of the molecule the substructure represents
    d['mol'] = []  # encode the molecule here so we don't have to combine multiple files when trying to have a look at the results
    d['mol_frag'] = []  # strucutre of the fragment
    d['inchikey'] = []  # inchikey of the molecule

    # row ids are used for storing the correct molecule and fragment ids in the
    #  output table, so they need to be set before the computation occurs
    if col_to_index_mols != '':
        df_mols.index = list(df_mols[col_to_index_mols])
    if col_to_index_frags != '':
        df_frags.index = list(df_frags[col_to_index_frags])

    # if no inchikey was computed, generate an empty column
    if col_mol_inchikey not in df_mols.columns:
        df_mols['inchikey'] = ''

    # tautomers
    if tautomer:
        df_mols[col_mol_mols + '_taut'] = df_mols[col_mol_mols].map(lambda x: rdTautomerQuery.TautomerQuery(x).GetTemplateMolecule())

    # begin
    for i in range(len(df_mols.index)):
        rowm = df_mols.iloc[i]
        if tautomer:
            mol = rowm[col_mol_mols + '_taut']
        else:
            mol = rowm[col_mol_mols]
        hac = mol.GetNumAtoms()
        for j in range(len(df_frags.index)):
            rowq = df_frags.iloc[j]
            # perform the substructure search on mol so the latest matching fragment does not get highlighted
            matches = mol.GetSubstructMatches(rowq[col_mol_frags])
            if len(matches) > 0:
                logging.debug("MOL %s + FRAG %s ==> %s", rowm.name, rowq.name, matches)
                for m in matches:
                    d['idm'].append(rowm.name)
                    d['idf'].append(rowq.name)
                    d['_aidxf'].append(m)  # frozenset so we can use intersection, etc. and still remove dupl. easily
                    d['mol_perc'].append(round(len(m)/hac, 2) * 100)
                    d['mol'].append(rowm[col_mol_mols])
                    d['inchikey'].append(rowm[col_mol_inchikey])
                    d['mol_frag'].append(rowq[col_mol_frags])

    df_fs = DataFrame(d)
    df_fs['idf_idx'] = df_fs.groupby(['idm', 'idf']).cumcount()  # rank seems to be working with np.float types only...

    # add fcp labels
    if fcp_labels is not None and fcp_labels != '':
        df_frags = df_frags.rename({'idm': 'idf'}, axis=1)
        df_fs = df_fs.merge(df_frags[['idf', fcp_labels]], on='idf', how='left')

    return df_fs.rename({fcp_labels: '_fcp_labels'}, axis=1)