import json
import pandas as pd
import logging, sys
from choppa.data.toy_data.resources import TOY_PHYLO_DATA
# Configure logging at import time: records of level INFO and above are
# written to stdout (basicConfig is a no-op if the root logger already has
# handlers configured).
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
# No name is passed, so this is the root logger.
logger = logging.getLogger()
[docs]
def phylo_json_to_df(json_file, gene=None, site_range=(207, 379)):
    """
    Converts a dataset of phylogenetics fitness data formatted in JSON into
    a table format as DataFrame as used in `choppa.IO.input`. This is prone to
    breaking (depending on how the JSON is formatted). Ideally the JSON
    has been generated with https://github.com/jbloomlab/SARS2-mut-fitness/blob/main/scripts/export_fitness_to_json.py

    Parameters
    ----------
    json_file : str or path-like
        Path to a JSON file whose top-level ``"data"`` entry can be passed
        to ``pandas.DataFrame``.
    gene : str, optional
        When given (and non-empty), only rows whose ``"gene"`` column equals
        this value are exported. When omitted, the whole table is returned
        and no filtering of any kind is applied.
    site_range : tuple of (first, last) or None, optional
        Inclusive bounds on the ``"site"`` column, applied only together
        with `gene`. Defaults to ``(207, 379)``, the window this function
        has always applied; pass ``None`` to keep every site of the gene.

    Returns
    -------
    pandas.DataFrame
        The full table, or the rows left after gene/site filtering.

    Raises
    ------
    ValueError
        If `gene` is given but has no rows in the data, or if none of its
        rows fall inside `site_range`.
    """
    # Use a context manager so the handle is closed even if parsing fails.
    # Binary mode lets `json.load` detect the UTF-8/16/32 encoding itself
    # instead of relying on the platform's default text encoding.
    with open(json_file, "rb") as json_handle:
        fitness_df = pd.DataFrame(json.load(json_handle)["data"])

    if not gene:
        return fitness_df

    print(f"Available genes: {set(fitness_df['gene'].values)}")
    fitness_df_for_gene = fitness_df[fitness_df["gene"] == gene]
    if fitness_df_for_gene.empty:
        raise ValueError(f"Gene '{gene}' not found in data:\n{fitness_df}")

    if site_range is not None:
        first_site, last_site = site_range
        fitness_df_for_gene = fitness_df_for_gene[
            fitness_df_for_gene["site"].between(first_site, last_site)
        ]
        # Report an empty site window separately from a missing gene so the
        # error message points at the actual cause.
        if fitness_df_for_gene.empty:
            raise ValueError(
                f"Gene '{gene}' has no entries with site between "
                f"{first_site} and {last_site} in data:\n{fitness_df}"
            )

    n_sites = fitness_df_for_gene["site"].nunique()
    logger.info(
        f"Extracted {len(fitness_df_for_gene)} entries across {n_sites} sites during JSON->DataFrame conversion"
    )
    return fitness_df_for_gene
[docs]
def nextstrain_to_csv(nextstrain_tsv):
    """
    Placeholder without an implementation.

    `nextstrain_tsv` is accepted but never read, and the function always
    returns None.
    NOTE(review): the name suggests a Nextstrain TSV -> CSV conversion is
    planned here; confirm the intended behavior before relying on it.
    """
    return None
if __name__ == "__main__":
    # Script entry point: convert the bundled toy phylogenetic dataset for
    # gene "N" and write it as CSV (without the index column) into the
    # current working directory.
    toy_fitness_df = phylo_json_to_df(TOY_PHYLO_DATA, "N")
    toy_fitness_df.to_csv("sars2_fitness.csv", index=False)