Source code for npfc.fragment_combination

"""
Module fragment_combination
===========================
This module contains the functions for classifying fragment combinations.
"""
# standard
import logging
import itertools
# chemoinformatics
from rdkit.Chem import Mol
from rdkit.Chem import AllChem
# docs
from pandas import DataFrame
# dev
from npfc import utils


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #


def get_fragment_combination_categories(include_fp: bool = False) -> list:
    """Return the list of all possible Fragment Combination Categories.

    The codes are listed fusions first, then connections (annulated, monopodal,
    bipodal, tripodal, other). False positive codes are appended last, and only
    on request.

    :param include_fp: include false positives
    :return: the list of all possible fragment combination categories
    """
    fusions = ('fs', 'fe', 'fb', 'fl')
    annulated = ('ca',)
    monopodal = ('cm',)
    bipodal = ('cbs', 'cbe', 'cbb', 'cbl')
    tripodal = ('cts', 'cte', 'ctb', 'ctl')
    others = ('cos', 'coe', 'cob', 'col')

    categories = [*fusions, *annulated, *monopodal, *bipodal, *tripodal, *others]
    if include_fp:
        categories.extend(('ffs', 'cfc', 'ffo'))  # false positives
    return categories
def get_rings_between_two_fragments(mol: Mol, aidxf1: set, aidxf2: set) -> list:
    """Return the atom indices of every ring that connects two fragments together.

    A ring is reported when it shares at least one atom with each fragment.

    :param mol: the input molecule
    :param aidxf1: the atom indices of the first fragment found in the molecule
    :param aidxf2: the atom indices of the second fragment found in the molecule
    :return: a list of intermediary rings between both fragments, each one a set of atom indices
    """
    candidate_rings = (set(ring) for ring in mol.GetRingInfo().AtomRings())
    return [
        ring for ring in candidate_rings
        if not ring.isdisjoint(aidxf1) and not ring.isdisjoint(aidxf2)
    ]
def get_shortest_path_between_frags(mol: Mol, aidxf1: set, aidxf2: set) -> tuple:
    """Return the shortest path within a molecule between two fragments defined by atom indices.

    The first and last atom indices of the path belong to fragment 1 and
    fragment 2 respectively, so they should not be counted when estimating the
    distance between fragments (i.e. distance = len(shortest_path) - 2).

    :param mol: the input molecule
    :param aidxf1: the atom indices of the first fragment found in the molecule
    :param aidxf2: the atom indices of the second fragment found in the molecule
    :return: the atom indices of the shortest path between both fragments. The first index is the attachment point from fragment 1 whereas the last index is the attachment point from fragment 2
    """
    # one candidate path for every (atom of fragment 1, atom of fragment 2) pair
    candidate_paths = [
        AllChem.GetShortestPath(mol, begin, end)
        for begin in tuple(aidxf1)
        for end in tuple(aidxf2)
    ]
    # several paths may share the minimal length: the first one enumerated wins
    return min(candidate_paths, key=len)
def _exclude_exocyclic(mol: Mol, aidxf: frozenset) -> frozenset:
    """Exclude exocyclic atoms from a collection of atom indices.

    An atom index is kept when it belongs to any ring of the molecule.

    :param mol: the input molecule
    :param aidxf: the atom indices
    :return: the filtered atom indices
    """
    # NOTE: an earlier variant of this function only kept the atoms of rings
    # that were entirely contained in aidxf; this one keeps every ring atom.
    logging.debug("Input aidxf=%s", aidxf)
    # every atom of the molecule that is a member of at least one ring
    ring_atoms = set(itertools.chain.from_iterable(mol.GetRingInfo().AtomRings()))
    logging.debug("Ring atoms=%s", ring_atoms)
    # non-ring (exocyclic) atoms drop out of the intersection
    to_keep = aidxf.intersection(ring_atoms)
    logging.debug("to_keep=%s", to_keep)
    return frozenset(to_keep)
def classify(mol: Mol, aidxf1: set, aidxf2: set, cutoff: int = 3, exclude_exocyclic: bool = False) -> dict:
    """Classify a fragment combination found in a molecule as a dictionary with category, type and subtype values.

    Following algorithm is applied for classifying fragment combinations:

    .. image:: _images/fragment_tree.png

    Fragment 1: red; Fragment 2: green; Fused Atoms: yellow.

    Possible classifications are:

    - fusion
        - spiro (fs)
        - edge (fe)
        - bridged (fb)
        - linker (fl)
        - false_positive
            - substructure (ffs)
            - overlap (ffo)
    - connection
        - annulated (ca)
        - monopodal (cm)
        - bipodal
            - spiro (cbs)
            - edge (cbe)
            - bridged (cbb)
            - linker (cbl)
        - tripodal
            - spiro (cts)
            - edge (cte)
            - bridged (ctb)
            - linker (ctl)
        - other
            - spiro (cos)
            - edge (coe)
            - bridged (cob)
            - linker (col)
        - false_positive
            - cutoff (cfc)

    :param mol: the input molecule
    :param aidxf1: the atom indices of the first fragment found in the molecule
    :param aidxf2: the atom indices of the second fragment found in the molecule
    :param cutoff: maximum number of intermediary atoms between 2 fragments to consider them as a combination (labelled as cfc otherwise)
    :param exclude_exocyclic: exclude exocyclic atoms during classification
    :return: a dictionary with the keys 'category', 'type', 'subtype', 'fcc', 'cp1' and 'cp2'. cp1 and cp2 are the connection points, expressed as positions within aidxf1 and aidxf2 (in their iteration order) rather than as atom indices of the molecule
    """
    logging.debug("cuttoff: %s", cutoff)

    # keep the fragment hits as lists so that connection points can be labelled
    # from the fragment perspective (position in the hit), not the molecule's
    aidxf1_list = list(aidxf1)
    aidxf2_list = list(aidxf2)
    aidxf1 = frozenset(aidxf1)
    aidxf2 = frozenset(aidxf2)

    # exclude exocyclic atoms from aidxf1 and 2 during classification
    if exclude_exocyclic:
        aidxf1 = _exclude_exocyclic(mol, aidxf1)
        aidxf2 = _exclude_exocyclic(mol, aidxf2)

    # atoms in common in fragment 1 and fragment 2
    aidx_fused = aidxf1.intersection(aidxf2)
    logging.debug("aidx1: %s", aidxf1)
    logging.debug("aidx2: %s", aidxf2)
    logging.debug("aidx_fused: %s", aidx_fused)

    # in case of overlapping fragments in the queries, we get overlapping matches
    if aidxf1.issubset(aidxf2) or aidxf2.issubset(aidxf1):
        fcc = 'ffs'
        logging.debug("Classification: %s", fcc)
        cp1 = [aidxf1_list.index(x) for x in aidx_fused]
        cp2 = [aidxf2_list.index(x) for x in aidx_fused]
        return {'category': 'fusion', 'type': 'false_positive', 'subtype': 'substructure', 'fcc': fcc, 'cp1': cp1, 'cp2': cp2}

    if len(aidx_fused) > 0:
        category = 'fusion'
        logging.debug("aidx_fused=%s, ", aidx_fused)
        cp1 = [aidxf1_list.index(x) for x in aidx_fused]
        cp2 = [aidxf2_list.index(x) for x in aidx_fused]

        if len(aidx_fused) == 1:
            fcc = 'fs'
            logging.debug("Classification: %s", fcc)
            return {'category': category, 'type': 'spiro', 'subtype': '', 'fcc': fcc, 'cp1': cp1, 'cp2': cp2}

        if len(aidx_fused) == 2:
            fcc = 'fe'
            logging.debug("Classification: %s", fcc)
            return {'category': category, 'type': 'edge', 'subtype': '', 'fcc': fcc, 'cp1': cp1, 'cp2': cp2}

        # 3 or more fused atoms from here on
        for aidxr in mol.GetRingInfo().AtomRings():
            # if at least one ring is completely present in the overlap between fragments,
            # then it's a false positive due to the fragments overlap and not a combination.
            if set(aidxr).issubset(aidx_fused):
                fcc = 'ffo'
                logging.debug("Classification: %s", fcc)
                return {'category': category, 'type': 'false_positive', 'subtype': 'overlap', 'fcc': fcc, 'cp1': cp1, 'cp2': cp2}

        # bridged is checked after ffo since 3-5 atoms might actually define a full ring
        if 3 <= len(aidx_fused) <= 5:
            fcc = 'fb'  # the code that is logged is the code that is returned
            logging.debug("Classification: %s", fcc)
            return {'category': category, 'type': 'bridged', 'subtype': '', 'fcc': fcc, 'cp1': cp1, 'cp2': cp2}

        # linker with > 5 fused atoms!
        fcc = 'fl'
        logging.debug("Classification: %s", fcc)
        return {'category': category, 'type': 'linker', 'subtype': '', 'fcc': fcc, 'cp1': cp1, 'cp2': cp2}

    # not fusion so connection
    category = 'connection'
    logging.debug("category: %s", category)

    # need to estimate how far apart the 2 fragments are; begin and end atoms
    # are in the shortest path but should not be considered for the cutoff
    shortest_path_between_frags = get_shortest_path_between_frags(mol, aidxf1, aidxf2)
    n_intermediary_atoms = len(shortest_path_between_frags) - 2
    logging.debug("shortest_path_between_frags: n=%s %s", n_intermediary_atoms, shortest_path_between_frags)

    # if the fragments are too far apart (cut-off), then it is a false positive combination
    if n_intermediary_atoms > cutoff:
        fcc = 'cfc'
        logging.debug("Classification: %s (shortest_path_between_frags: %s > %s)", fcc, n_intermediary_atoms, cutoff)
        return {'category': category, 'type': 'false_positive', 'subtype': 'cutoff', 'fcc': fcc, 'cp1': [], 'cp2': []}

    # if the fragments are close enough, have a look at how many intermediary rings connect them
    intermediary_rings = get_rings_between_two_fragments(mol, aidxf1, aidxf2)
    logging.debug("intermediary_rings: %s", intermediary_rings)
    ring_info = mol.GetRingInfo()

    # no intermediary rings are found: no direct ring inbetween both fragments
    if len(intermediary_rings) == 0:
        # not always monopodal connections, as we detect annulated combinations too
        ring_bonds = set(itertools.chain.from_iterable(ring_info.BondRings()))
        logging.debug("Ring Bonds: %s", ring_bonds)
        shortest_path_between_frags_inner_bonds = {
            mol.GetBondBetweenAtoms(begin, end).GetIdx()
            for begin, end in zip(shortest_path_between_frags, shortest_path_between_frags[1:])
        }
        logging.debug("shortest path inner bonds: %s", shortest_path_between_frags_inner_bonds)

        # if all bonds of the shortest path are within rings, then the fragments are annulated
        if shortest_path_between_frags_inner_bonds.issubset(ring_bonds):
            # The connection points are found between each fragment and the ring system inbetween:
            # 1. get the list of all rings met by the shortest path (minus start and end
            #    to avoid the ring atoms of the fragments themselves)
            # 2. fuse these rings
            # 3. overlap between the fused rings and the fragments are the CPs.
            sssr = [set(x) for x in ring_info.AtomRings()]
            sp = set(shortest_path_between_frags[1:-1])  # consider intermediary atoms only
            sssr_annulated = [x for x in sssr if x.intersection(sp)]
            sssr_annulated = set(utils.fuse_rings(sssr_annulated)[0])  # by definition only one big ring
            cp1 = [aidxf1_list.index(x) for x in sssr_annulated.intersection(aidxf1)]
            cp2 = [aidxf2_list.index(x) for x in sssr_annulated.intersection(aidxf2)]
            fcc = 'ca'
            logging.debug("Classification: %s", fcc)
            return {'category': category, 'type': 'annulated', 'subtype': '', 'fcc': fcc, 'cp1': cp1, 'cp2': cp2}

        # if not, it is a monopodal connection
        fcc = 'cm'
        logging.debug("Classification: %s", fcc)
        cp1 = [aidxf1_list.index(shortest_path_between_frags[0])]
        cp2 = [aidxf2_list.index(shortest_path_between_frags[-1])]
        return {'category': category, 'type': 'monopodal', 'subtype': '', 'fcc': fcc, 'cp1': cp1, 'cp2': cp2}

    # define what intermediary rings we are talking about
    sssr = [set(x) for x in ring_info.AtomRings()]  # smallest sets of smallest rings
    # filter equivalent intermediary rings
    intermediary_rings = _filter_intermediary_rings(mol, intermediary_rings, sssr)
    intermediary_rings = _filter_intermediary_rings_smallest_sssr(intermediary_rings, aidxf1_list, aidxf2_list)
    logging.debug("Number of intermediary rings: %s", len(intermediary_rings))

    # attribute the type depending on the number of intermediary rings.
    # 1 ring -> 2 paths (bipodal), 2 rings -> 3 paths (tripodal), any other count -> other
    # subtype is deduced from the number of atoms in common between each fragment and each intermediary ring
    if len(intermediary_rings) == 1:
        fc_type = 'bipodal'
    elif len(intermediary_rings) == 2:
        fc_type = 'tripodal'
    else:
        fc_type = 'other'
    return _get_combination_subtype(category, fc_type, aidxf1_list, aidxf2_list, intermediary_rings)
def _filter_intermediary_rings_smallest_sssr(intermediary_rings: list, aidxf1_list: list, aidxf2_list: list) -> list:
    """Group intermediary rings by common atoms with respectively fragment 1 or fragment 2
    and return only the smallest ring of each group.

    This is to avoid such cases where cbb are labelled as ctb because of an extra ring found by RDKit:
    Oc1c2c(c(O)n1-c1cccc(C(F)(F)F)c1)C1CCCC2CC1 (ChEMBL183882)

    :param intermediary_rings: the intermediary rings, each one a set of atom indices
    :param aidxf1_list: the atom indices of the first fragment found in the molecule
    :param aidxf2_list: the atom indices of the second fragment found in the molecule
    :return: the intermediary rings left after keeping the smallest ring of each group
    """
    aidxf1 = frozenset(aidxf1_list)
    aidxf2 = frozenset(aidxf2_list)
    logging.info("IR: %s", intermediary_rings)

    df = DataFrame({'IR': intermediary_rings})
    df['aidxf1'] = [aidxf1] * len(df)
    df['aidxf2'] = [aidxf2] * len(df)
    # frozensets so that the intersections can be used as grouping keys
    df['intersect_1'] = df.apply(lambda x: frozenset(x.IR.intersection(x.aidxf1)), axis=1)
    df['intersect_2'] = df.apply(lambda x: frozenset(x.IR.intersection(x.aidxf2)), axis=1)
    df['IR_size'] = df['IR'].map(len)
    # put smallest IR first, so that first() picks the smallest ring of each group
    df = df.sort_values('IR_size')
    logging.debug("IR before grouping by intersection with either fragment:\n%s\n", df)
    df = df.groupby('intersect_1').first()
    df = df.groupby('intersect_2').first()
    return list(df['IR'].values)


def _filter_intermediary_rings(mol: Mol, intermediary_rings: list, sssr: list) -> list:
    """Filter the intermediary rings found within a molecule using the Smallest Set of Smallest Rings (SSSR).

    The idea is that if two fragments have 2 intermediary rings that are almost identical but for a few atoms,
    and these atoms actually are contained within the same ring, then only one intermediary ring should be counted.

    This will lower the amount of bipodal that are being identified as tripodal and tripodal
    that are identified as unknown connections.

    :param mol: a molecule
    :param intermediary_rings: the intermediary rings of a molecule, each one a set of atom indices
    :param sssr: the smallest set of smallest rings of a molecule, each one a set of atom indices (left untouched)
    :return: the filtered intermediary rings of a molecule
    """
    # group the intermediary rings (ir) by size, with an id attributed to each ir for tracking down
    ir_by_size = {}
    for i, ring in enumerate(intermediary_rings):
        ir_id = f"IR_{str(i).zfill(3)}"
        ir_by_size.setdefault(len(ring), []).append((ring, ir_id))
        logging.debug("%s: %s", ir_id, ring)

    # only sizes shared by more than one ir need to be investigated
    ir_to_check = {size: irs for size, irs in ir_by_size.items() if len(irs) > 1}
    if not ir_to_check:
        return intermediary_rings

    for size, irs in ir_to_check.items():
        logging.debug("IR to check with size=%s: %s", size, ', '.join([x[1] for x in irs]))

    # attribute an id to each SSSR; a new list is built so the caller's list is not modified
    labelled_sssr = [(ring, f"SSSR_{str(i).zfill(3)}") for i, ring in enumerate(sssr)]
    for ring, sssr_id in labelled_sssr:
        logging.debug("%s: %s", sssr_id, ring)

    # compare the ir of a given size pairwise, looking at the atoms that vary between them
    to_remove = []
    for size, irs in ir_to_check.items():
        to_remove_curr = []
        for i in range(len(irs)):
            for j in range(i + 1, len(irs)):
                logging.debug("Comparing %s and %s", irs[i][1], irs[j][1])
                # atoms found in only one of both rings
                diff = irs[i][0] - irs[j][0]
                diff.update(irs[j][0] - irs[i][0])
                logging.debug("Variant atom indices: %s", diff)
                # idx: atom, so variant atoms neighbors can be looked up
                atoms = {x: mol.GetAtomWithIdx(x) for x in diff}
                # indices of the neighbors of every variant atom
                neighbors = {idx: [x.GetIdx() for x in a.GetNeighbors()] for idx, a in atoms.items()}
                # pairs of variant atoms that are neighbors, then pairwise combinations of these pairs
                bonded_pairs = [(idx1, idx2) for idx1, idx2 in itertools.combinations(neighbors, 2) if idx2 in neighbors[idx1]]
                # flatten the subtuples so combinations can be looked up in rings
                combinations = [tuple(itertools.chain.from_iterable(c)) for c in itertools.combinations(bonded_pairs, 2)]
                for c in combinations:
                    logging.debug("Possible combination: %s", c)
                # check what combinations are found within a ring of the molecule;
                # a combination is recorded once per ring that contains it
                combinations_identified = []
                for c in combinations:
                    combinations_identified += [c for r in labelled_sssr if set(c).issubset(r[0])]
                if combinations_identified == combinations:
                    logging.debug("All combinations were identified, %s and %s are equivalent", irs[i][1], irs[j][1])
                    to_remove_curr.append(irs[i])
                    to_remove_curr.append(irs[j])

        # remove duplicates (frozensets are hashable), then sort by id to keep the order consistent
        to_remove_curr = list({(frozenset(tr[0]), tr[1]) for tr in to_remove_curr})
        to_remove_curr.sort(key=lambda x: x[1])
        logging.debug("IR to remove for n=%s: %s", size, ', '.join([tr[1] for tr in to_remove_curr]))
        # in case all ir of this size are equivalent, keep the first one
        if len(to_remove_curr) == len(irs):
            logging.debug("All IR were detected equivalent, so keeping %s", to_remove_curr[0][1])
            to_remove_curr.pop(0)
        # add current size to the whole mask
        to_remove += to_remove_curr

    to_remove_ids = list({tr[1] for tr in to_remove})
    logging.debug("Total IR to remove: %s", ', '.join(to_remove_ids))
    # drop the ids and filter the IR by ring content
    rings_to_remove = {tr[0] for tr in to_remove}
    remaining_ir = [ir for ir in intermediary_rings if frozenset(ir) not in rings_to_remove]
    logging.debug("Number of remaining_ir: %s", len(remaining_ir))
    return remaining_ir


def _get_combination_subtype(category: str, type: str, aidxf1_list: list, aidxf2_list: list, intermediary_rings: list) -> dict:
    """Return the subtype (linker, spiro, bridged, edge) for bipodal, tripodal and other connections.

    Each intermediary ring is compared with both fragments; the number of atoms
    it shares with either fragment decides the subtype: more than 5 is a
    linker, exactly 1 is a spiro, 3 to 5 is bridged and anything else is an
    edge. When rings disagree, the priority is linker > spiro > bridged > edge.

    :param category: the fragment combination category (connection)
    :param type: the fragment combination type (bipodal, tripodal or other)
    :param aidxf1_list: the atom indices of the first fragment found in the molecule
    :param aidxf2_list: the atom indices of the second fragment found in the molecule
    :param intermediary_rings: the list of intermediary rings between both fragments and defined by atom indices
    :return: the dictionary specifying fragment combination category, type, subtype, fcc and connection points (cp1, cp2), the latter as positions within aidxf1_list and aidxf2_list
    """
    logging.debug("Iterating over remaining IR (new ids and will continue iteration only if subtype=edge):")
    fcc = category[0] + type[0]
    aidxf1 = set(aidxf1_list)
    aidxf2 = set(aidxf2_list)
    linker = False
    bridged = False
    spiro = False
    cp1 = []
    cp2 = []
    for i, ir in enumerate(intermediary_rings):
        logging.debug("IR#%s => %s: %s", i, f"IR_{str(i).zfill(3)}", ir)
        intersect_1 = ir.intersection(aidxf1)
        intersect_2 = ir.intersection(aidxf2)
        cp1 += [aidxf1_list.index(x) for x in intersect_1]
        cp2 += [aidxf2_list.index(x) for x in intersect_2]
        logging.debug("intersect_1: %s (%s), intersect_2: %s (%s)", intersect_1, len(intersect_1), intersect_2, len(intersect_2))
        if len(intersect_1) > 5 or len(intersect_2) > 5:
            logging.debug("Subtype: linker")
            linker = True
        elif len(intersect_1) == 1 or len(intersect_2) == 1:
            logging.debug("Subtype: spiro")
            spiro = True
        elif 3 <= len(intersect_1) <= 5 or 3 <= len(intersect_2) <= 5:
            logging.debug("Subtype: bridged")
            bridged = True
        # else: it is an edge

    # priority: linker > spiro > bridged > edge
    if linker:
        subtype, code = 'linker', 'l'
    elif spiro:
        subtype, code = 'spiro', 's'
    elif bridged:
        subtype, code = 'bridged', 'b'
    else:
        subtype, code = 'edge', 'e'
    fcc += code
    logging.debug("Classification: %s", fcc)
    # a connection point shared by several rings is reported once
    return {'category': category, 'type': type, 'subtype': subtype, 'fcc': fcc, 'cp1': list(set(cp1)), 'cp2': list(set(cp2))}
def classify_df(df_aidxf: DataFrame, cutoff: int = 3, clear_cfc: bool = True, exclude_exocyclic: bool = False) -> DataFrame:
    """Return a DataFrame with all fragment combination categories for a given set of molecules
    and fragment atom indices obtained by substructure search.

    Rows of the input are grouped by molecule (idm) and every pair of fragment
    hits within a molecule is classified. For more details about category,
    type and subtype, see the documentation of the function classify.

    The output DataFrame describes each fragment combination with the columns:

    - idm: the id of the molecule
    - inchikey: the inchikey found on the row of fragment 1
    - idf1, idf1_idx, fid1: the id of fragment 1, the index of its hit, and both joined as 'idf:idx'
    - idf2, idf2_idx, fid2: same for fragment 2
    - fcc: a code indicating category, type and subtype
    - category, type, subtype
    - _aidxf1, _aidxf2: the atom indices of fragment 1 and fragment 2 found in the molecule
    - hac: the number of atoms of the molecule
    - mol, mol_frag_1, mol_frag_2: the molecule and both fragments
    - fc: the fragment combination written as a string
    - _fcp_labels_1, _fcp_labels_2: the connection point labels of both fragments (empty dicts if not available)

    Fragments with a number of intermediary atoms higher than defined cutoff are labelled as false positives.

    :param df_aidxf: the input DataFrame with substructure matches; a column 'aidxf_str' is added to it
    :param cutoff: the maximum number of intermediary atoms between 2 fragments
    :param clear_cfc: remove cfc combinations (false positives) from results
    :param exclude_exocyclic: exclude exocyclic atoms from fragment atom indices (during classification only)
    :return: a DataFrame with all fragment combination classifications
    """
    ds_fcc = []
    logging.debug("columns in df_aidxf: %s", df_aidxf.columns)

    # labelling idxf: string version of the atom indices, since sets are an unhashable type
    # NOTE(review): this column is written to the caller's DataFrame and not read in this function
    df_aidxf['aidxf_str'] = df_aidxf['_aidxf'].map(str)

    # classify fragment combinations
    for gid, g in df_aidxf.groupby('idm'):
        mol = g.iloc[0]['mol']
        hac = mol.GetNumAtoms()  # so we can estimate how well-covered is the molecule by its fragment combinations
        rows = [g.iloc[i] for i in range(len(g))]
        # every unordered pair of fragment hits, in row order
        for row_f1, row_f2 in itertools.combinations(rows, 2):
            aidxf1 = row_f1['_aidxf']
            idf1 = row_f1['idf']
            idf1_idx = row_f1['idf_idx']
            molf1 = row_f1['mol_frag']
            aidxf2 = row_f2['_aidxf']
            idf2 = row_f2['idf']
            idf2_idx = row_f2['idf_idx']
            molf2 = row_f2['mol_frag']

            logging.debug("="*80)
            logging.debug("Classifying m=%s, f1=%s:%s, f2=%s:%s", gid, idf1, idf1_idx, idf2, idf2_idx)
            logging.debug("aidxf1=%s", aidxf1)
            logging.debug("aidxf2=%s", aidxf2)
            d_fcc = classify(mol, aidxf1, aidxf2, cutoff=cutoff, exclude_exocyclic=exclude_exocyclic)

            if len(d_fcc['cp1']) > 0:
                # translate the connection points into their fcp labels when these are available
                try:
                    fcp1 = [row_f1['_fcp_labels'][x] for x in sorted(d_fcc['cp1'])]
                    fcp2 = [row_f2['_fcp_labels'][x] for x in sorted(d_fcc['cp2'])]
                except KeyError:
                    logging.warning('FCP labels are not available')
                    fcp1 = sorted(d_fcc['cp1'])
                    fcp2 = sorted(d_fcc['cp2'])
                fc = f"{idf1}:{idf1_idx}@{','.join([str(x) for x in fcp1])}[{d_fcc['fcc']}]{idf2}:{idf2_idx}@{','.join([str(x) for x in fcp2])}"
            else:
                # no connection points: useful only for cfc combinations
                fc = f"{idf1}:{idf1_idx}[{d_fcc['fcc']}]{idf2}:{idf2_idx}"
            logging.debug("fc=%s", fc)

            # record fragment combination
            d_fcc['idm'] = gid
            d_fcc['mol'] = mol
            d_fcc['inchikey'] = row_f1['inchikey']
            d_fcc['idf1'] = idf1
            d_fcc['idf1_idx'] = idf1_idx
            d_fcc['fid1'] = str(idf1) + ":" + str(idf1_idx)
            d_fcc['idf2'] = idf2
            d_fcc['idf2_idx'] = idf2_idx
            d_fcc['fid2'] = str(idf2) + ":" + str(idf2_idx)
            d_fcc['_aidxf1'] = aidxf1
            d_fcc['_aidxf2'] = aidxf2
            d_fcc['hac'] = hac
            d_fcc['mol_frag_1'] = molf1
            d_fcc['mol_frag_2'] = molf2
            try:
                d_fcc['_fcp_labels_1'] = row_f1['_fcp_labels']
                d_fcc['_fcp_labels_2'] = row_f2['_fcp_labels']
            except KeyError:
                # both label columns fall back to an empty dict
                d_fcc['_fcp_labels_1'] = {}
                d_fcc['_fcp_labels_2'] = {}
            d_fcc['fc'] = fc
            ds_fcc.append(d_fcc)

    logging.debug("="*80)

    # dataframe with columns in given order
    df_fcc = DataFrame(ds_fcc, columns=['idm', 'inchikey', 'idf1', 'idf1_idx', 'fid1', 'idf2', 'idf2_idx', 'fid2', 'fcc', 'category', 'type', 'subtype', '_aidxf1', '_aidxf2', 'hac', 'mol', 'mol_frag_1', 'mol_frag_2', 'fc', '_fcp_labels_1', '_fcp_labels_2'])

    # clear_cfc
    if clear_cfc:
        df_fcc = df_fcc[df_fcc['fcc'] != 'cfc']

    return df_fcc