# Source code for npfc.fragment_search

"""
Module fragment_search
==========================
This module contains the function to run substructure searches.
"""


# standard
import logging
# data handling
from pandas import DataFrame
# chemoinformatics
from rdkit.Chem import rdTautomerQuery


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #


def get_fragment_hits(df_mols: DataFrame,
                      df_frags: DataFrame,
                      col_mol_mols: str = 'mol',
                      col_mol_frags: str = 'mol',
                      col_mol_inchikey: str = 'inchikey',
                      fcp_labels: str = None,
                      tautomer: bool = False,
                      col_to_index_mols: str = 'idm',
                      col_to_index_frags: str = 'idm',
                      ) -> DataFrame:
    """Create a DataFrame recording every Fragment Hit in the input molecule DataFrame.

    Every fragment of df_frags is searched as a substructure in every molecule
    of df_mols. Each match found produces one row (one Fragment Hit) with the
    following columns:

        1) idm: the id of the molecule (row index label from df_mols)
        2) idf: the id of the fragment (row index label from df_frags)
        3) _aidxf: the match returned by the substructure search (the atom indices of the fragment found in the molecule)
        4) mol_perc: the percentage of the molecule the fragment represents (number of matched atoms / number of atoms)
        5) mol: the molecule, as stored in df_mols
        6) mol_frag: the fragment, as stored in df_frags
        7) inchikey: the inchikey of the molecule (empty string if not available)
        8) idf_idx: the rank of the match among all matches of the same (idm, idf) pair
        9) _fcp_labels: the fcp labels of the fragment (only if fcp_labels is specified)

    :param df_mols: the input DataFrame with the molecules
    :param df_frags: the input DataFrame with the fragments to use for substructure search
    :param col_mol_mols: the column name in df_mols with the molecules
    :param col_mol_frags: the column name in df_frags with the fragments
    :param col_mol_inchikey: the column name in df_mols with the inchikey of the molecule. If it does not exist, the inchikey column of the output is filled with empty strings.
    :param fcp_labels: the column name in df_frags with the fcp labels. If None or empty (''), no label is added to the output.
    :param tautomer: if set to True, tautomers will be taken into account during fragment search (warning, tautomer-independent search is much slower!)
    :param col_to_index_mols: set the row indices of df_mols to the specified column. If empty (''), indices are left untouched.
    :param col_to_index_frags: set the row indices of df_frags to the specified column. If empty (''), indices are left untouched.
    :return: the substructure matches as a DataFrame

    .. warning:: Row indices are used for recording the ids of substructure hits and are therefore required to be set to the molecule identifiers (i.e. idm). When col_to_index_mols / col_to_index_frags are not empty, the row indices of the input DataFrames are modified in place. No column is added to the input DataFrames.
    """
    # row ids are used for storing the correct molecule and fragment ids in the
    # output table, so they need to be set before the computation occurs
    if col_to_index_mols != '':
        df_mols.index = list(df_mols[col_to_index_mols])
    if col_to_index_frags != '':
        df_frags.index = list(df_frags[col_to_index_frags])

    # extract everything needed once, so no row object has to be rebuilt for
    # each molecule/fragment pair inside the nested loop
    mol_ids = list(df_mols.index)
    mols = list(df_mols[col_mol_mols])
    frag_ids = list(df_frags.index)
    frags = list(df_frags[col_mol_frags])

    # if no inchikey was computed, use empty values (the input is left untouched)
    if col_mol_inchikey in df_mols.columns:
        inchikeys = list(df_mols[col_mol_inchikey])
    else:
        inchikeys = [''] * len(mols)

    # tautomers: the search is performed on the template molecule of the
    # tautomer query, but the molecule reported in the output is the input one
    if tautomer:
        mols_to_search = [rdTautomerQuery.TautomerQuery(mol).GetTemplateMolecule() for mol in mols]
    else:
        mols_to_search = mols

    # init
    hits = {
        'idm': [],
        'idf': [],
        '_aidxf': [],
        'mol_perc': [],  # proportion of the molecule the substructure represents
        'mol': [],  # keep the molecule here so results can be inspected without combining multiple files
        'mol_frag': [],  # structure of the fragment
        'inchikey': [],  # inchikey of the molecule
    }

    # begin
    for idm, mol, mol_to_search, inchikey in zip(mol_ids, mols, mols_to_search, inchikeys):
        hac = mol_to_search.GetNumAtoms()
        for idf, frag in zip(frag_ids, frags):
            matches = mol_to_search.GetSubstructMatches(frag)
            if len(matches) == 0:
                continue
            logging.debug("MOL %s + FRAG %s ==> %s", idm, idf, matches)
            for match in matches:
                hits['idm'].append(idm)
                hits['idf'].append(idf)
                hits['_aidxf'].append(match)  # stored as returned by the substructure search
                # fraction rounded to 2 decimals, then expressed as a percentage
                hits['mol_perc'].append(round(len(match) / hac, 2) * 100)
                hits['mol'].append(mol)
                hits['inchikey'].append(inchikey)
                hits['mol_frag'].append(frag)

    df_fs = DataFrame(hits)
    # number the matches of a same fragment within a same molecule (0, 1, 2, ...)
    df_fs['idf_idx'] = df_fs.groupby(['idm', 'idf']).cumcount()

    # add fcp labels
    if fcp_labels is not None and fcp_labels != '':
        # idf values recorded above are the row index labels of df_frags, so the
        # labels are joined on these rather than on a hard-coded column name
        df_labels = DataFrame({'idf': frag_ids, fcp_labels: list(df_frags[fcp_labels])})
        df_fs = df_fs.merge(df_labels, on='idf', how='left')
        df_fs = df_fs.rename({fcp_labels: '_fcp_labels'}, axis=1)

    return df_fs