Skip to content

observational_extracted

combined_summaries(df)

Generate combined summaries of infection data using different sampling schemes.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame. Must contain the columns: 'effective_coi' (numeric, computed beforehand); 'genome_ids' (list-like; if not, parse first with parse_list); 'cotransmission' and 'superinfection' (Boolean); and the time-frame columns used for grouping ('year', 'month', 'continuous_month', 'season' and 'peak_season').

required

Returns:

Type Description

pd.DataFrame: A combined summary DataFrame with different sampling schemes.

Source code in fpg_observational_model/plotting_code/observational_extracted.py
def combined_summaries(df, sample_n=100):
    """
    Generate combined summaries of infection data using different sampling schemes.

    Six summaries are computed with summarize_infections and stacked row-wise.
    Each one is tagged with a 'sampling_scheme' label:
      - 'All - Yearly':           all rows, grouped by 'year'.
      - 'All - Monthly':          all rows, grouped by 'year', 'month' and
                                  'continuous_month'.
      - 'Sample - Proportional':  subsampled, grouped by 'year'.
      - 'Sample - Even':          subsampled with sample_proportionally=False,
                                  grouped by 'year'.
      - 'Sample - Seasonal':      subsampled, grouped by 'season'.
      - 'Sample - Peak Seasonal': subsampled, grouped by 'peak_season'.

    Parameters:
      df (pd.DataFrame): Input DataFrame. Must contain columns:
                         - 'effective_coi' (numeric, computed beforehand)
                         - 'genome_ids' (list-like; if not, parse first with parse_list)
                         - 'cotransmission' and 'superinfection' (Boolean)
                         - Grouping columns: 'year', 'month', 'continuous_month',
                           'season' and 'peak_season'
      sample_n (int): Sample size handed to summarize_infections for the
                      'Sample - ...' schemes. Defaults to 100, the value that
                      used to be hard-coded.

    Returns:
      pd.DataFrame: A combined summary DataFrame with different sampling schemes.
                    The index of each individual summary is kept as-is.
    """
    # (label, keyword arguments for summarize_infections); order defines row order.
    schemes = [
        ('All - Yearly', {'groupby_cols': ['year']}),
        ('All - Monthly', {'groupby_cols': ['year', 'month', 'continuous_month']}),
        ('Sample - Proportional', {'groupby_cols': ['year'], 'sample_n': sample_n}),
        ('Sample - Even', {'groupby_cols': ['year'], 'sample_n': sample_n,
                           'sample_proportionally': False}),
        ('Sample - Seasonal', {'groupby_cols': ['season'], 'sample_n': sample_n}),
        ('Sample - Peak Seasonal', {'groupby_cols': ['peak_season'], 'sample_n': sample_n}),
    ]

    summaries = [
        summarize_infections(df, **kwargs).assign(sampling_scheme=label)
        for label, kwargs in schemes
    ]

    return pd.concat(summaries, axis=0)

parse_list(s)

Converts a string representation of a list into an actual Python list. If conversion fails, returns an empty list.

Source code in fpg_observational_model/plotting_code/observational_extracted.py
def parse_list(s):
    """
    Convert a string representation of a list into an actual Python list.

    Parameters:
      s: Usually a string such as "[1, 2, 3]". A value that is already a list
         is returned unchanged.

    Returns:
      list: The parsed list. A tuple or set literal is converted to a list.
            An empty list is returned when the value cannot be parsed
            (e.g. NaN, None, malformed text) or when it parses to something
            that is not a collection (e.g. "5").
    """
    # Already parsed: literal_eval would reject a list object and the data
    # would be silently replaced by [].
    if isinstance(s, list):
        return s

    try:
        parsed = ast.literal_eval(s)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the errors literal_eval raises for unparsable input.
        return []

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, (tuple, set)):
        return list(parsed)
    # Scalars, dicts, etc. are not lists; honour the "always a list" contract.
    return []

process_file(row, output_summary_dir, reassign_intervention_time=True)

Process a single file and write the summary output.

Parameters:

Name Type Description Default
row Series

A row from the file list DataFrame.

required
output_summary_dir str

Folder where output files will be saved.

required

Returns:

Name Type Description
str

The path to the written summary file.

Source code in fpg_observational_model/plotting_code/observational_extracted.py
def process_file(row, output_summary_dir, reassign_intervention_time=True):
    """Process a single file and write the summary output.

    Reads "infIndexRecursive-genomes-df.csv" from the row's input directory,
    derives the summary table and writes it to
    "<output_summary_dir>/<output_name>_summary.csv".

    Parameters:
      row (pd.Series): A row from the file list DataFrame. Must provide
                       'output_name' and 'input_dir'.
      output_summary_dir (str): Folder where output files will be saved.
                                It is created if it does not exist yet.
      reassign_intervention_time (bool): When True (default), the data is passed
                                         through reassign_by_intervention before
                                         the genetic processing step.

    Returns:
      str: The path to the written summary file.
    """
    output_name = row['output_name']
    # Construct the full path to the input file.
    input_file = os.path.join(row['input_dir'], "infIndexRecursive-genomes-df.csv")

    # Read and process the input file.
    df = pd.read_csv(input_file)
    # The +1 mirrors the +1 shift that process_genetic_data applies to 'month'
    # afterwards (presumably 'month' is 0-based in the raw file; verify).
    df['continuous_month'] = (df["year"]) * 12 + df["month"] + 1
    if reassign_intervention_time:
        df = reassign_by_intervention(df)
    df = process_genetic_data(df)
    run_summary = combined_summaries(df)

    # Make sure the destination folder exists; an empty string means the
    # current directory and needs no creation.
    if output_summary_dir:
        os.makedirs(output_summary_dir, exist_ok=True)

    # Construct output file path.
    output_file = os.path.join(output_summary_dir, f"{output_name}_summary.csv")
    run_summary.to_csv(output_file, index=False)

    return output_file

process_genetic_data(df)

Processes the DataFrame by:
  1. Parsing the 'genome_ids' column and computing:
     - true_coi: the total count of items in genome_ids.
     - effective_coi: the count of unique items in genome_ids.
  2. Parsing the 'bite_ids' column and computing:
     - superinfection: a Boolean indicating whether bite_ids contains more than one distinct value.
     - cotransmission: a Boolean indicating whether bite_ids contains exactly one distinct value.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame containing at least 'genome_ids' and 'bite_ids' columns as strings.

required

Returns:

Type Description

pd.DataFrame: The modified DataFrame with additional computed columns.

Source code in fpg_observational_model/plotting_code/observational_extracted.py
def process_genetic_data(df):
    """
    Subsample infection records and annotate them with genetic summary columns.

    Steps, in order:
      1. Keep one randomly chosen row per ('IndividualID', 'year') pair
         (seeded with the module-level input_seed) and add 1 to 'month'.
      2. Parse 'genome_ids' with parse_list and derive:
         - true_coi: number of entries in genome_ids.
         - effective_coi: number of distinct entries in genome_ids.
      3. Parse 'bite_ids' with parse_list and derive:
         - superinfection: True when bite_ids holds more than one distinct value.
         - cotransmission: True when bite_ids holds exactly one distinct value.
      4. Add 'season' and 'peak_season' by applying assign_season_group and
         assign_peak_group to every row.

    Parameters:
      df (pd.DataFrame): Input DataFrame containing at least 'IndividualID',
                         'year', 'month', and the 'genome_ids' and 'bite_ids'
                         columns as strings.

    Returns:
      pd.DataFrame: The subsampled DataFrame with the additional computed columns.
    """

    # One representative infection per person per year.
    per_person_year = df.groupby(['IndividualID', 'year'])
    df = per_person_year.sample(n=1, random_state=input_seed)
    df['month'] = df['month'].add(1)

    # Genome identifiers: raw count vs. distinct count.
    genomes = df["genome_ids"].apply(parse_list)
    df["genome_ids"] = genomes
    df["true_coi"] = genomes.apply(len)
    df["effective_coi"] = genomes.apply(lambda ids: len(set(ids)))

    # Bite identifiers: classify by the number of distinct bites.
    bites = df["bite_ids"].apply(parse_list)
    df["bite_ids"] = bites
    distinct_bites = bites.apply(lambda ids: len(set(ids)))
    df["superinfection"] = distinct_bites > 1
    df["cotransmission"] = distinct_bites == 1

    # Row-wise season labels; 'season' is added before 'peak_season' is computed.
    for column, classifier in (("season", assign_season_group),
                               ("peak_season", assign_peak_group)):
        df[column] = df.apply(classifier, axis=1)

    return df

summarize_infections(df, groupby_cols=['year', 'month'], sample_n=None, sample_proportionally=True, sample_seasons=False, seed=input_seed)

Group and summarize infection data by a given time frame.

For each group (e.g., by year and month), computes: (1) the total number of rows, the count and proportion of rows with effective_coi > 1, and the mean effective_coi; (2) the total and unique counts (and the unique proportion) of genome_ids, after flattening the lists from all rows; (3) the count and proportion of rows where cotransmission is True, and likewise for superinfection, among the rows with effective_coi > 1.

Optionally, a random sample of rows can be taken before grouping.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame. Must contain columns: - 'effective_coi' (numeric, computed beforehand) - 'genome_ids' (list-like; if not, parse first with parse_list) - 'cotransmission' (Boolean) - Time frame columns (default: 'year' and 'month')

required
groupby_cols list

List of column names to group by. Defaults to ['year', 'month'].

['year', 'month']
sample_n int or None

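If provided, rows are randomly subsampled before summarizing: up to n rows per year by default, up to n/2 rows per season when sample_seasons is True, or up to n/12 rows per year-month when sample_proportionally is False (fractions rounded down).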

None
seed int

Random seed for sampling.

input_seed
Source code in fpg_observational_model/plotting_code/observational_extracted.py
def summarize_infections(df, groupby_cols=['year', 'month'], 
    sample_n=None, sample_proportionally=True, sample_seasons=False,
    seed=input_seed):
    """
    Group and summarize infection data by a given time frame.

    For each group (e.g., by year and month), computes:
      1. Total rows, count and proportion of rows with effective_coi > 1,
         and the mean effective_coi.
      2. Total and unique counts (and the proportion) of genome_ids 
         (after flattening the lists from all rows).
      3. Count and proportion of cotransmission and of superinfection among
         the rows with effective_coi > 1.

    Optionally, a random sample of rows can be taken before grouping.

    Parameters:
      df (pd.DataFrame): Input DataFrame. Must contain columns:
                         - 'effective_coi' (numeric, computed beforehand)
                         - 'genome_ids' (list-like; if not, parse first with parse_list)
                         - 'cotransmission' and 'superinfection' (Boolean)
                         - Time frame columns (default: 'year' and 'month'), plus
                           whichever of 'year', 'month', 'season' the chosen
                           sampling mode groups on.
      groupby_cols (list): List of column names to group by. Defaults to ['year', 'month'].
                           The default list is never mutated here.
      sample_n (int or None): If provided, subsample the rows before summarizing
                              (see the two flags below for how). A group with
                              fewer rows than its quota keeps all of its rows.
      sample_proportionally (bool): When True (default), sample up to sample_n
                                    rows per 'year' (or per 'season', see
                                    sample_seasons). When False, sample up to
                                    floor(sample_n / 12) rows per ('year', 'month').
      sample_seasons (bool): Only used when sample_proportionally is True. When
                             True, sample up to floor(sample_n / 2) rows per
                             'season' instead of sampling per 'year'.
      seed (int, optional): Random seed for sampling. Defaults to the
                            module-level input_seed and is used by every
                            sampling mode.

    Returns:
      pd.DataFrame: One row per group with the groupby_cols followed by
                    n_infections, poly_coi_count, poly_coi_prop, coi_mean,
                    genome_ids_total_count, genome_ids_unique_count,
                    genome_ids_unique_prop, cotransmission_count,
                    cotransmission_prop, superinfection_count and
                    superinfection_prop. Proportions are None when their
                    denominator is zero.
    """
    # Optionally, subsample the DataFrame.
    if sample_n is not None:
        # Pick the sampling strata and the per-stratum quota.
        if not sample_proportionally:
            sample_keys = ['year', 'month']
            quota = math.floor(sample_n / 12)
        elif sample_seasons:
            sample_keys = ['season']
            quota = math.floor(sample_n / 2)
        else:
            sample_keys = ['year']
            quota = sample_n

        # Each stratum contributes up to `quota` rows (all rows if it is smaller).
        # `seed` is honoured in every mode; previously two modes ignored it and
        # always used the global input_seed.
        df = df.groupby(sample_keys, group_keys=False).apply(
            lambda grp: grp.sample(n=min(len(grp), quota), random_state=seed)
        )

    def group_summary(group):
        n = len(group)
        # Count poly_coi: rows with effective_coi > 1.
        poly_count = (group['effective_coi'] > 1).sum()
        poly_prop = poly_count / n if n > 0 else None

        # effective coi mean
        effective_coi_mean = group['effective_coi'].mean()

        # Flatten the genome_ids lists from all rows in the group;
        # entries that are not lists are skipped.
        all_genome_ids = [gid for sublist in group['genome_ids'] 
                                 if isinstance(sublist, list)
                                 for gid in sublist]
        total_genome = len(all_genome_ids)
        unique_genome = len(set(all_genome_ids))
        unique_prop = unique_genome / total_genome if total_genome > 0 else None

        # Cotransmission / superinfection are counted among polygenomic rows only.
        polygenomic = group[group['effective_coi'] > 1]
        cotrans_count = polygenomic['cotransmission'].sum()  # Assuming boolean where True==1, False==0
        cotrans_prop = cotrans_count / poly_count if poly_count > 0 else None

        supertrans_count = polygenomic['superinfection'].sum()
        supertrans_prop = supertrans_count / poly_count if poly_count > 0 else None

        return pd.Series({
            'n_infections': n,
            'poly_coi_count': poly_count,
            'poly_coi_prop': poly_prop,
            'coi_mean': effective_coi_mean,
            'genome_ids_total_count': total_genome,
            'genome_ids_unique_count': unique_genome,
            'genome_ids_unique_prop': unique_prop,
            'cotransmission_count': cotrans_count,
            'cotransmission_prop': cotrans_prop,
            'superinfection_count': supertrans_count,
            'superinfection_prop': supertrans_prop
        })

    summary_df = df.groupby(groupby_cols).apply(group_summary).reset_index()
    return summary_df