Source code for choppa.IO.input

import pandas as pd
from typing import Optional
from pathlib import Path
import numpy as np
import logging, sys
from collections import OrderedDict

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger()



[docs]
class FitnessFactory:
    """
    Base class for handling Fitness data within `choppa`.
    """


[docs]
    def __init__(
        self,
        input_fitness_csv: Path,
        resindex_colname: Optional[str] = "residue_index",
        wildtype_colname: Optional[str] = "wildtype",
        mutant_colname: Optional[str] = "mutant",
        fitness_colname: Optional[str] = "fitness",
        confidence_colname: Optional[str] = None,
    ):
        self.input_fitness_csv = input_fitness_csv
        self.resindex_colname = resindex_colname
        self.wildtype_colname = wildtype_colname
        self.mutant_colname = mutant_colname
        self.fitness_colname = fitness_colname
        self.confidence_colname = confidence_colname
        if self.confidence_colname is not None:
            # makes it easier to track whether confidence values are provided or set to NaN by us
            self.confidence_set = True
        else:
            self.confidence_set = False
        self.fitness_df = None



[docs]
    def check_validity(self, fitness_df):
        """
        Does some quick checks to make sure the imported CSV file is valid.
        """
        if self.confidence_colname not in fitness_df.columns:
            raise KeyError(
                f"Column {self.confidence_colname} not found in {self.input_fitness_csv}"
            )

        # quick check for columns
        non_confidence_columns = [
            self.resindex_colname,
            self.wildtype_colname,
            self.mutant_colname,
            self.fitness_colname,
        ]
        missing_columns = []
        for colname in non_confidence_columns:
            if not colname in fitness_df.columns:
                missing_columns.append(colname)
        if missing_columns:
            raise KeyError(
                f"Column(s) {missing_columns} not found in {self.input_fitness_csv}"
            )

        # keep only the requested columns
        fitness_df = fitness_df[non_confidence_columns + [self.confidence_colname]]

        # if residue_index is a float (ran into this use-case), convert to int.
        fitness_df[self.resindex_colname] = fitness_df[self.resindex_colname].apply(
            np.int64
        )

        # check that there aren't any NaNs and that fitness (and confidence) data is scalar
        if fitness_df[non_confidence_columns].isnull().values.any():
            raise ValueError(
                f"Found missing values in input CSV: {fitness_df[fitness_df[non_confidence_columns].isnull().any(axis=1)]}"
            )
        if (
            len(
                fitness_df[
                    pd.to_numeric(
                        fitness_df[self.fitness_colname], errors="coerce"
                    ).isnull()
                ]
            )
            > 0
        ):
            raise ValueError(
                f"Found non-numeric fitness values in input CSV: {fitness_df[pd.to_numeric(fitness_df[self.fitness_colname], errors='coerce').isnull()]}"
            )
        if self.confidence_colname is not None:
            if self.confidence_set:
                if (
                    len(
                        fitness_df[
                            pd.to_numeric(
                                fitness_df[self.confidence_colname], errors="coerce"
                            ).isnull()
                        ]
                    )
                    > 0
                ):
                    raise ValueError(
                        f"Found non-numeric confidence values in input CSV: {fitness_df[pd.to_numeric(fitness_df[self.confidence_colname], errors='coerce').isnull()]}"
                    )

        # if checks reach this point then the input data should be correctly formatted. Rename and adopt.
        fitness_df = fitness_df.rename(
            columns={
                self.resindex_colname: "residue_index",
                self.wildtype_colname: "wildtype",
                self.mutant_colname: "mutant",
                self.fitness_colname: "fitness",
            }
        )
        self.fitness_df = fitness_df
        return True



[docs]
    def read_fitness_csv(self):
        """
        Reads in a fitness CSV file and checks that all requested columns are present, complete and numeric.
        """
        # read in the complete fitness data. This may have many columns
        logger.info(f"Reading in fitness data from {self.input_fitness_csv}")
        fitness_df = pd.read_csv(self.input_fitness_csv)

        # if no confidence is provided, just set to NaN
        if self.confidence_colname is None:
            fitness_df["confidence"] = np.nan
            self.confidence_colname = "confidence"

        # check whether the CSV file is correct
        if self.check_validity(fitness_df):
            logger.info(f"Successfully read fitness data:\n{self.fitness_df}")
            return self.fitness_df



[docs]
    def get_fitness_basedict(self):
        """
        Converts a `pandas` fitness dataframe (read by `FitnessFactory.read_fitness_csv`) into
        a `fitness basedict` which is essentially just an `OrderedDict`.

        We want the dict to have the form:
        {
        residue_index: {
                fitness_csv_index, # this is the original index in the fitness CSV for provenance
                wildtype: {AA, fitness, confidence},
                mutants: [{AA, fitness, confidence}, {AA, fitness, confidence}, etc]
            }
        }

        """
        fitness_basedict = OrderedDict()

        for residx, res_df in self.read_fitness_csv().groupby(by="residue_index"):

            # first construct the dict entry for the wildtype
            wildtype = res_df["wildtype"].values[0]

            # if there is no wildtype mutation available, the experimentalists have omitted it and set
            # the value implicitly as 0.
            if len(res_df[res_df.mutant == wildtype]) == 0:
                res_df.loc[-1] = [
                    res_df["residue_index"].values[0],
                    res_df["wildtype"].values[0],
                    res_df["wildtype"].values[0],  # this is where we add the wildtype
                    0.0,
                    res_df["confidence"].values[0],
                ]

            wt_dict = {
                "aa": wildtype,
                "fitness": res_df[res_df.mutant == wildtype]["fitness"].values[0],
                "confidence": res_df[res_df.mutant == wildtype]["confidence"].values[0],
            }
            # now construct the mutant list of dict entries for this residue index while excluding wildtype
            mutant_list = []
            for mut, fitness, conf in res_df[
                [
                    "mutant",
                    "fitness",
                    "confidence",
                ]
            ].values:
                if not mut == wildtype:  # ignore wildtype entry
                    mutant_list.append(
                        {"aa": mut, "fitness": fitness, "confidence": conf}
                    )

            fitness_basedict[residx] = {
                "fitness_csv_index": residx,  # double now, but will be helpful for provenance after alignment
                "wildtype": wt_dict,
                "mutants": mutant_list,
            }

        logger.info(
            f"Created fitness dictionary as `FitnessFactory` of length {len(fitness_basedict)}"
        )
        return fitness_basedict




from Bio.PDB import PDBParser
from rdkit import Chem



[docs]
class ComplexFactory:
    """ """


[docs]
    def __init__(
        self,
        path_to_pdb_file: Path,
    ):
        self.path_to_pdb_file = path_to_pdb_file



[docs]
    def remove_waters(system):
        """[Placeholder] Returns a system with water entries removed"""
        return system



[docs]
    def extract_ligands(system):
        """[Placeholder] Returns a system's ligands"""
        return system



[docs]
    def check_validity(self, complex):
        """
        [Placeholder] Does some quick checks to make sure the imported PDB structure is valid. We're
        not doing any kind of protein prep, just whether biopython _is able to_ read the PDB
        file and we try to figure out what entry names the solvent/ligands have (if there are any)
        """
        return complex



[docs]
    def load_pdb(self):
        """
        Loads an input PDB file
        """
        complex = PDBParser(QUIET=False).get_structure("COMPLEX", self.path_to_pdb_file)

        self.check_validity(complex)
        return complex



[docs]
    def load_pdb_rdkit(self):
        """
        Loads an input PDB file to an RDKit object for easier string retrieval
        """
        return Chem.MolFromPDBFile(self.path_to_pdb_file, sanitize=False)




if __name__ == "__main__":
    from choppa.data.toy_data.resources import (
        TOY_COMPLEX,
        TOY_FITNESS_DATA_COMPLETE,
        TOY_FITNESS_DATA_COMPLETE_NOCONF,
    )

    fitness_df = FitnessFactory(
        TOY_FITNESS_DATA_COMPLETE, confidence_colname="confidence"
    ).get_fitness_basedict()
    complex = ComplexFactory(TOY_COMPLEX).load_pdb()