# Source code for BFAIR.mfa.INCA.INCA_input_parser

"""INCA input parser.
Methods to prepare input data to fit the BFAIR INCA tools format.
"""

import cobra
import pandas as pd
from molmass.molmass import Formula


def parse_cobra_model(model_file_name, model_id, date):
    """
    Parses reaction- and metabolite information out of a cobra model saved
    as a .json or .sbml file and makes it compatible with the
    BFAIR.INCA tools

    The file type is decided by the extension at the very end of the file
    name, compared case-insensitively. A ".json" or ".sbml" that only
    appears in a directory name or in the middle of the file name does not
    select a loader.

    Parameters
    ----------
    model_file_name : str or path + str
        Filename or path to file + filename of the cobra metabolic model
    model_id : str
        Name of the model (for downstream reference)
    date : str
        Date of model processing (for downstream reference)

    Returns
    -------
    model_data : pandas.DataFrame
        General information about the processed metabolic model
    reaction_data : pandas.DataFrame
        Information about the reactions in the metabolic model
    metabolite_data : pandas.DataFrame
        Information about the metabolites in the metabolic model

    Raises
    ------
    TypeError
        File provided is not a .json or a .sbml file
    """
    # Look only at the end of the name: a substring test would misroute
    # paths such as "data.json/model.sbml" to the json loader.
    lowered_name = str(model_file_name).lower()
    if lowered_name.endswith(".json"):
        filetype = "json"
        cobra_model = cobra.io.load_json_model(model_file_name)
    elif lowered_name.endswith(".sbml"):
        filetype = "sbml"
        cobra_model = cobra.io.read_sbml_model(model_file_name)
    else:
        raise TypeError("File type not supported, must be'.json' or '.sbml'.")
    # Both file types share the same downstream table construction
    return _parse_json_sbml_cobra_model(
        cobra_model, model_id, date, model_file_name, filetype
    )


def _parse_json_sbml_cobra_model(
    cobra_model, model_id, date, model_file_name, filetype
):
    """
    Build the model, reaction and metabolite tables from a cobra model that
    has already been loaded (helper for parse_cobra_model())

    Parameters
    ----------
    cobra_model : cobra.Model
        Cobra metabolic model as loaded by the file type specific import
        function
    model_id : str
        Name of the model (for downstream reference)
    date : str
        Date of model processing (for downstream reference)
    model_file_name : str or path + str
        Filename or path to file + filename of the cobra metabolic model;
        the file is read again here as UTF-8 text
    filetype : str
        Extension of the provided file

    Returns
    -------
    model_data : pandas.DataFrame
        One-row table with general information about the processed
        metabolic model, including the raw text of the model file
    reaction_data : pandas.DataFrame
        One row per reaction in the metabolic model
    metabolite_data : pandas.DataFrame
        One row per metabolite in the metabolic model
    """
    # The raw file content is stored next to the parsed information
    with open(model_file_name, encoding="utf-8") as handle:
        raw_model_text = handle.read()

    # General model information (single row, index 0)
    model_record = {
        "model_id": model_id,
        "date": date,
        "model_description": cobra_model.description,
        "model_file": raw_model_text,
        "file_type": filetype,
    }
    model_data = pd.DataFrame(model_record, index=[0])

    # Reaction information, keyed by the reaction's position in the model
    reaction_rows = {
        position: {
            "model_id": model_id,
            "rxn_id": rxn.id,
            "rxn_name": rxn.name,
            "equation": rxn.build_reaction_string(),
            "subsystem": rxn.subsystem,
            "gpr": rxn.gene_reaction_rule,
            "genes": [gene.id for gene in rxn.genes],
            "reactants_stoichiometry": [
                rxn.get_coefficient(reactant.id) for reactant in rxn.reactants
            ],
            "reactants_ids": [reactant.id for reactant in rxn.reactants],
            "products_stoichiometry": [
                rxn.get_coefficient(product.id) for product in rxn.products
            ],
            "products_ids": [product.id for product in rxn.products],
            "lower_bound": rxn.lower_bound,
            "upper_bound": rxn.upper_bound,
            "objective_coefficient": rxn.objective_coefficient,
            "flux_units": "mmol*gDW-1*hr-1",
            "reversibility": rxn.reversibility,
            "used_": True,
        }
        for position, rxn in enumerate(cobra_model.reactions)
    }
    reaction_data = pd.DataFrame.from_dict(reaction_rows, "index")

    # Metabolite information, keyed by the metabolite's position in the model
    metabolite_rows = {}
    for position, metabolite in enumerate(cobra_model.metabolites):
        # Formulas that fail the validity check are stored as None;
        # valid ones are passed through molmass' Formula and stringified
        normalised_formula = (
            str(Formula(metabolite.formula)) if is_valid(metabolite) else None
        )
        metabolite_rows[position] = {
            "model_id": model_id,
            "met_name": metabolite.name,
            "met_id": metabolite.id,
            "formula": normalised_formula,
            "charge": metabolite.charge,
            "compartment": metabolite.compartment,
            "bound": metabolite._bound,
            "annotations": metabolite.annotation,
            "used_": True,
        }
    metabolite_data = pd.DataFrame.from_dict(metabolite_rows, "index")

    return model_data, reaction_data, metabolite_data


def is_valid(metabolite, INVALID_FORMULA_STR=("(", "Generic", "R", "X")):
    """
    The validity of the input metabolites is checked. It's invalid if it
    does not have any formula annotated or if the formula includes previously
    defined invalid symbols
    Svetlana Kutozova wrote this function

    The invalid symbols are matched as plain substrings of the formula, so
    any formula containing one of them anywhere is rejected.

    Parameters
    ----------
    metabolite : cobra.Metabolite
        A metabolite in the cobra format including additional information
        (formula, charge, elements)
    INVALID_FORMULA_STR : list or tuple
        Previously defined invalid symbol strings; the default is an
        immutable tuple so it cannot be altered between calls

    Returns
    -------
    boolean
        Valid or invalid metabolite
    """
    formula = metabolite.formula
    # A missing or empty formula can never be valid
    if not formula:
        return False
    return not any(symbol in formula for symbol in INVALID_FORMULA_STR)