Source code for emodpy_hiv.plotting.plot_hiv_by_age_and_gender

import os
import pandas as pd
from typing import Union

import emodpy_hiv.demographics.un_world_pop as unwp

import emodpy_hiv.plotting.xy_plot as xy_plot
import emodpy_hiv.plotting.helpers as helpers


# When False, plot_population_for_dir() clears the second title line (the input
# directory or UN World Pop filename) before plotting.
TEST_include_dir_or_filename = True

# Column headers used to look up data in the ReportHIVByAgeAndGender CSV files.
# NOTE(review): every header except "Year" starts with a space - presumably because
# the report header separates columns with ", " - confirm against a real report file.
COL_NAME_YEAR           = "Year"                         # noqa: E221
COL_NAME_NODE_ID        = " NodeId"                      # noqa: E221
COL_NAME_GENDER         = " Gender"                      # noqa: E221
COL_NAME_HAS_HIV        = " HasHIV"                      # noqa: E221
COL_NAME_RISK           = " IP_Key:Risk"                 # noqa: E221
COL_NAME_ACCESS         = " IP_Key:Accessibility"        # noqa: E221
COL_NAME_STATE          = " IP_Key:CascadeState"         # noqa: E221
COL_NAME_IS_CIRC        = " IsCircumcised"               # noqa: E221
COL_NAME_AGE            = " Age"                         # noqa: E221
COL_NAME_POP            = " Population"                  # noqa: E221
COL_NAME_INFECTED       = " Infected"                    # noqa: E221
COL_NAME_NEW_INF        = " Newly Infected"              # noqa: E221
COL_NAME_ON_ART         = " On_ART"                      # noqa: E221
COL_NAME_DIED           = " Died"                        # noqa: E221
COL_NAME_DIED_HIV       = " Died_from_HIV"               # noqa: E221
COL_NAME_TESTED_OR_ART  = " Tested Past Year or On_ART"  # noqa: E221
COL_NAME_TESTED_EVER    = " Tested Ever"                 # noqa: E221
COL_NAME_DIAGNOSED      = " Diagnosed"                   # noqa: E221
COL_NAME_NEW_TESTED_POS = " Newly Tested Positive"
COL_NAME_NEW_TESTED_NEG = " Newly Tested Negative"



[docs]
def create_title(base_title: str = "",
                 node_id: int = None,
                 gender: str = None,
                 show_avg_per_run: bool = False,
                 show_fraction: bool = False,
                 show_fraction_of: bool = False,
                 fraction_of_str: str = "",
                 hiv_negative: bool = False,
                 has_age_bins: bool = False):
    """
    Assemble the top-line title of a plot (also used for the filename) from the plot options.

    The title is built from these pieces, in order: an averaging/fraction lead-in,
    the optional denominator text, the group being counted (Females, Males or People),
    " Without HIV", the base title, " by Age", the node, and finally " Per Run"
    when the data is not averaged.

    Args:
        base_title (str, optional):
            The core text of the title; it names the specific data being plotted.
            It is skipped when empty.

        node_id (int, optional):
            The ID of the node the data is filtered for.  When given, " - Node <id>"
            is added to the title.

        gender (str, optional):
            "Female" or "Male" selects "Females" or "Males"; any other value
            (including None) produces "People".

        show_avg_per_run (bool, optional):
            True indicates that the data is an average over multiple runs.
            When False, " Per Run" is appended to the end of the title.

        show_fraction (bool, optional):
            True indicates that the data is a fraction (a count divided by another
            count) rather than a raw count.

        show_fraction_of (bool, optional):
            Only used when 'show_fraction' is True.  True means 'fraction_of_str'
            is inserted right after the lead-in to name the denominator.

        fraction_of_str (str, optional):
            The denominator text inserted when both 'show_fraction' and
            'show_fraction_of' are True.  It is inserted verbatim, so it must carry
            its own trailing space if one is wanted.

        hiv_negative (bool, optional):
            If True, " Without HIV" is added after the group name.

        has_age_bins (bool, optional):
            If True, " by Age" is added to the title.

    Returns:
        The string to be used as the top line title of the plot.
    """
    averaged = bool(show_avg_per_run)
    fractional = bool(show_fraction)

    # Lead-in text for every (averaged, fractional) combination.
    lead_in_by_mode = {
        (True, False): "Average Per Run ",
        (False, True): "Fraction of ",
        (True, True): "Average Fraction of ",
        (False, False): "",
    }
    pieces = [lead_in_by_mode[(averaged, fractional)]]

    if show_fraction and show_fraction_of:
        pieces.append(fraction_of_str)

    pieces.append("Females" if gender == "Female" else "Males" if gender == "Male" else "People")

    if hiv_negative:
        pieces.append(" Without HIV")
    if base_title:
        pieces.append(" " + base_title)
    if has_age_bins:
        pieces.append(" by Age")
    if node_id is not None:
        pieces.append(" - Node " + str(node_id))
    if not show_avg_per_run:
        pieces.append(" Per Run")

    return "".join(pieces)




[docs]
def create_y_axis_name(base_title: str = "",
                       node_id: int = None,
                       gender: str = None,
                       show_avg_per_run: bool = False,
                       show_fraction: bool = False,
                       show_fraction_of: bool = False,
                       fraction_of_str: str = "",
                       has_age_bins: bool = False):
    """
    Given the arguments, create a label for the y-axis that describes the data being plotted.

    Args:
        base_title (str, optional):
            This is the core string to place in the y-label.  It describes what is being plotted.
            A single space is inserted in front of it unless it is empty or already
            starts with whitespace.

        node_id (int, optional):
            Not used when building the label.

        gender (str, optional):
            The string (Male or Female) for the gender that data is being filtered for.
            Any other value (including None) produces "People".

        show_avg_per_run (bool, optional):
            Not used when building the label.

        show_fraction (bool, optional):
            True indicates that the data is not true counts but a fraction
            (i.e. a count divided by another counter).  The label then starts with
            "Fraction of " instead of "Number of ".

        show_fraction_of (bool, optional):
            When the denominator of the fraction can be different things,
            say population or infected, this allows you to specify the option
            that is not population.  Only used when 'show_fraction' is True.

        fraction_of_str (str, optional):
            If 'show_fraction_of' is true, then this argument can be used to include
            in the label what the denominator is.  It is inserted verbatim.

        has_age_bins (bool, optional):
            Not used when building the label.

    Returns:
        A string to be used as the label of the y-axis of the plot.
    """
    if show_fraction:
        y_axis_name = "Fraction of "
        if show_fraction_of:
            y_axis_name = y_axis_name + fraction_of_str
    else:
        y_axis_name = "Number of "

    if gender == "Female":
        y_axis_name = y_axis_name + "Females"
    elif gender == "Male":
        y_axis_name = y_axis_name + "Males"
    else:
        y_axis_name = y_axis_name + "People"

    if base_title:
        # Keep the group name and the description apart ("People Infected", not
        # "PeopleInfected") without doubling a space the caller already supplied.
        separator = "" if base_title[0].isspace() else " "
        y_axis_name = y_axis_name + separator + base_title

    return y_axis_name




[docs]
def extract_population_data(filename: str,
                            node_id: int = None,
                            gender: str = None,
                            age_bin: float = None,
                            other_strat_column_name: str = None,
                            other_strat_value: Union[int, float, str] = None):
    """
    Extract the population over time for a specific node, gender, age and/or other stratification.

    It is assumed that the file has ages every 5 years from 0 to 100.

    Args:
        filename (str, required):
            The name and path of the CSV report file to read.

        node_id (int, optional):
            If given, only the population of this node is returned.

        gender (str, optional):
            If given, only the population of this gender is returned.  "Male" is
            looked up as gender value 0; any other non-None value as 1.

        age_bin (float, optional):
            If given, only the population of this age value is returned.  The value
            80 is treated as "80+": the populations of ages 80, 85, 90, 95 and 100
            are added together.

        other_strat_column_name (str, optional):
            The name of one additional column to filter on.  Must be given together
            with 'other_strat_value'.

        other_strat_value (int, float or str, optional):
            The value of 'other_strat_column_name' to select.

    Returns:
        A dataframe indexed by the year values of the report with a single
        population column holding the summed population for the selection.

    Raises:
        ValueError: if the population column or 'other_strat_column_name' is missing
            from the file, or if only one of 'other_strat_column_name' and
            'other_strat_value' is given.
    """
    df = pd.read_csv(filename)

    # -------------------------------------------
    # Verify the CSV file has the correct columns
    # -------------------------------------------
    if COL_NAME_POP not in df.columns:
        raise ValueError(f"'{COL_NAME_POP}' column does not exist in the file({filename}).")
    if other_strat_column_name and (other_strat_column_name not in df.columns):
        raise ValueError(f"'{other_strat_column_name}' column does not exist in the file({filename}).")

    if (other_strat_column_name is None) != (other_strat_value is None):
        raise ValueError("Both 'other_strat_column_name' and 'other_strat_value' must be specified.")

    # ---------------------------------------------------------------
    # Determine which columns to put in the pivot table and, in the
    # same order, the values that select one column of that table.
    # ---------------------------------------------------------------
    pv_columns = []
    key_parts = []
    if node_id is not None:
        # The node filter must pivot on the node column so that 'node_id'
        # is looked up in the matching column level.
        pv_columns.append(COL_NAME_NODE_ID)
        key_parts.append(node_id)
    if gender is not None:
        pv_columns.append(COL_NAME_GENDER)
        key_parts.append(0 if gender == "Male" else 1)
    if other_strat_column_name is not None:
        pv_columns.append(other_strat_column_name)
        key_parts.append(other_strat_value)
    if age_bin is not None:
        pv_columns.append(COL_NAME_AGE)

    pv = df.pivot_table(index=COL_NAME_YEAR,
                        columns=pv_columns,
                        values=COL_NAME_POP,
                        aggfunc="sum")

    def _pivot_column(parts):
        """Return the pivot table column selected by the list of level values."""
        if len(parts) == 0:
            # Nothing was filtered on, so the pivot table holds only the
            # total population column.
            return pv[COL_NAME_POP]
        if len(parts) == 1:
            return pv[parts[0]]
        return pv[tuple(parts)]

    # ------------------------------------------------------------
    # Extract data from pivot table and put in dataframe.
    # If the input age_bin is 80, we need to include the data from
    # the older ages because 80 for UN World Pop is really 80+.
    # ------------------------------------------------------------
    df2 = pd.DataFrame()
    df2.index = pv.index.values

    if age_bin is not None:
        df2[COL_NAME_POP] = 0
        ages = [80, 85, 90, 95, 100] if age_bin == 80 else [age_bin]
        for age in ages:
            df2[COL_NAME_POP] = df2[COL_NAME_POP] + _pivot_column(key_parts + [age])
    else:
        df2[COL_NAME_POP] = _pivot_column(key_parts)

    return df2




[docs]
def extract_population_data_multiple_ages(filename: str,
                                          node_id: int = None,
                                          gender: str = None,
                                          age_bin_list: list[float] = None,
                                          filter_by_hiv_negative: bool = False,
                                          other_strat_column_name: str = None,
                                          other_strat_value: Union[int, float, str] = None,
                                          other_data_column_names: list[str] = None):
    """
    Read one report file and sum the population (and optional extra data columns)
    per year for a node/gender/stratification selection, optionally split into age bins.

    Args:
        filename (str, required):
            The name and path of the CSV report file to read.

        node_id (int, optional):
            If given, only data of this node is returned.

        gender (str, optional):
            If given, only data of this gender is returned.  "Male" is looked up as
            gender value 0; any other non-None value as 1.

        age_bin_list (list[float], optional):
            Boundaries of the age bins.  Consecutive values form the bins [min, max);
            every age found in the file that falls inside a bin is added to that bin.
            The output columns are then named '<column>:<min> - <max>'.

        filter_by_hiv_negative (bool, optional):
            If True and the file has the HasHIV column, only rows where it is 0 are used.

        other_strat_column_name (str, optional):
            The name of one additional column to filter on.  Must be given together
            with 'other_strat_value'.

        other_strat_value (int, float or str, optional):
            The value of 'other_strat_column_name' to select.

        other_data_column_names (list[str], optional):
            Additional data columns to sum and return next to the population.

    Returns:
        A tuple of (dataframe indexed by the report's year values, bool that is True
        when the HIV-negative filter was actually applied).

    Raises:
        ValueError: if a required column is missing from the file or if only one of
            'other_strat_column_name' and 'other_strat_value' is given.
    """
    report_df = pd.read_csv(filename)

    # -----------------------------------------
    # Validate the report columns and arguments
    # -----------------------------------------
    if COL_NAME_POP not in report_df.columns:
        raise ValueError(f"'Population' column does not exist in the file({filename}).")
    if other_strat_column_name is not None and other_strat_column_name not in report_df.columns:
        raise ValueError(f"'{other_strat_column_name}' column does not exist in the file({filename}).")

    strat_name_given = other_strat_column_name is not None
    strat_value_given = other_strat_value is not None
    if strat_name_given != strat_value_given:
        raise ValueError("Both 'other_strat_column_name' and 'other_strat_value' must be specified.")

    if other_data_column_names is None:
        other_data_column_names = []
    has_extra_data = len(other_data_column_names) > 0

    # -----------------------------------------------------------------
    # Collect the pivot table column levels and, in the same order, the
    # level values that select the requested node/gender/stratification
    # -----------------------------------------------------------------
    pivot_levels = []
    selection = []
    if node_id is not None:
        pivot_levels.append(COL_NAME_NODE_ID)
        selection.append(node_id)
    if gender is not None:
        pivot_levels.append(COL_NAME_GENDER)
        selection.append(0 if gender == "Male" else 1)
    if other_strat_column_name is not None:
        pivot_levels.append(other_strat_column_name)
        selection.append(other_strat_value)

    # The ages present in the file are collected before any HIV filtering.
    ages_in_file = []
    if age_bin_list is not None:
        pivot_levels.append(COL_NAME_AGE)
        ages_in_file = report_df[COL_NAME_AGE].unique()

    data_is_for_hiv_negative = False
    if filter_by_hiv_negative and COL_NAME_HAS_HIV in report_df.columns:
        report_df = report_df[report_df[COL_NAME_HAS_HIV] == 0]
        data_is_for_hiv_negative = True

    # A single value column is passed as a string, several as a list; this
    # decides whether the pivot table columns get a leading data-name level.
    if has_extra_data:
        for extra_name in other_data_column_names:
            if extra_name is not None and extra_name not in report_df.columns:
                raise ValueError(f"'{extra_name}' column does not exist in the file({filename}).")
        value_columns = [COL_NAME_POP, *other_data_column_names]
    else:
        value_columns = COL_NAME_POP

    pivot = report_df.pivot_table(index=COL_NAME_YEAR,
                                  columns=pivot_levels,
                                  values=value_columns,
                                  aggfunc="sum")

    def _pivot_key(data_name, trailing_values):
        """Build the pivot column key: [data name] + selection + trailing values."""
        parts = [data_name] if has_extra_data else []
        parts.extend(selection)
        parts.extend(trailing_values)
        return parts[0] if len(parts) == 1 else tuple(parts)

    # -------------------------------------------------------
    # Copy the selected pivot columns into the result frame.
    # With age bins, each output column carries the bin range
    # in its name and accumulates every file age in the bin.
    # -------------------------------------------------------
    result_df = pd.DataFrame()
    result_df.index = pivot.index.values

    output_names = [COL_NAME_POP, *other_data_column_names]

    if age_bin_list:
        for age_min, age_max in zip(age_bin_list[:-1], age_bin_list[1:]):
            bin_label = f":{age_min!s} - {age_max!s}"
            for data_name in output_names:
                result_df[data_name + bin_label] = 0
            for age in ages_in_file:
                if age_min <= age < age_max:
                    for data_name in output_names:
                        target = data_name + bin_label
                        result_df[target] = result_df[target] + pivot[_pivot_key(data_name, [float(age)])]
    else:
        for data_name in output_names:
            # Without any selection the pivot column is simply the data column.
            key = _pivot_key(data_name, []) if len(selection) > 0 else data_name
            result_df[data_name] = pivot[key]

    return result_df, data_is_for_hiv_negative




[docs]
def extract_population_data_by_stratification(filename: str,
                                              node_id: int = None,
                                              gender: str = None,
                                              age_bin_list: list[float] = None,
                                              start_column_name: str = None,
                                              strat_values: list[str] = None):
    """
    Read one report file and return the population per year for each value of a
    stratification column, optionally split into age bins.

    Args:
        filename (str, required):
            The name and path of the CSV report file to read.

        node_id (int, optional):
            If given, only the population of this node is counted.

        gender (str, optional):
            If given, only the population of this gender is counted.  "Male" is
            looked up as gender value 0; any other non-None value as 1.

        age_bin_list (list[float], optional):
            Boundaries of the age bins; at least two values are required.
            Consecutive values form the bins [min, max).  The output columns are
            then named '<strat value>:<min> - <max>'.

        start_column_name (str, required):
            The name of the column whose values define the stratification.

        strat_values (list[str], optional):
            The stratification values to extract.  When None, every value found
            in the column (except 'nan') is used.

    Returns:
        A dataframe indexed by the report's year values with one column per
        stratification value (and age bin).

    Raises:
        ValueError: if 'age_bin_list' has fewer than two values or a required
            column is missing from the file.
    """
    if age_bin_list is not None and len(age_bin_list) < 2:
        too_few_bins_msg = ("'age_bin_list' must have at least two values.\n"
                            + "The second value is the max of the i-th bin and the min of the (i+1)-th bin.")
        raise ValueError(too_few_bins_msg)

    report_df = pd.read_csv(filename)

    # --------------------------------------------
    # Make sure the columns we rely on are present
    # --------------------------------------------
    if COL_NAME_POP not in report_df.columns:
        raise ValueError(f"'Population' column does not exist in the file({filename}).")
    if start_column_name not in report_df.columns:
        raise ValueError(f"'{start_column_name}' column does not exist in the file({filename}).")

    if strat_values is None:
        strat_values = [value for value in report_df[start_column_name].unique() if str(value) != 'nan']

    # ------------------------------------------------------------------
    # Pivot levels: the stratification column first, then the optional
    # node / gender / age levels.  'selection' holds the matching level
    # values for node and gender in the same order.
    # ------------------------------------------------------------------
    pivot_levels = [start_column_name]
    selection = []
    if node_id is not None:
        pivot_levels.append(COL_NAME_NODE_ID)
        selection.append(node_id)
    if gender is not None:
        pivot_levels.append(COL_NAME_GENDER)
        selection.append(0 if gender == "Male" else 1)

    ages_in_file = []
    if age_bin_list is not None:
        pivot_levels.append(COL_NAME_AGE)
        ages_in_file = report_df[COL_NAME_AGE].unique()

    pivot = report_df.pivot_table(index=COL_NAME_YEAR,
                                  columns=pivot_levels,
                                  values=COL_NAME_POP,
                                  aggfunc="sum")

    # -------------------------------------------------------
    # Copy the selected pivot columns into the result frame.
    # With age bins, each output column carries the bin range
    # in its name and accumulates every file age in the bin.
    # -------------------------------------------------------
    result_df = pd.DataFrame()
    result_df.index = pivot.index.values

    for strat_value in strat_values:
        key_parts = [strat_value, *selection]
        if age_bin_list:
            for age_min, age_max in zip(age_bin_list[:-1], age_bin_list[1:]):
                target = strat_value + f":{age_min!s} - {age_max!s}"
                result_df[target] = 0
                for age in ages_in_file:
                    if age_min <= age < age_max:
                        result_df[target] = result_df[target] + pivot[tuple(key_parts + [float(age)])]
        else:
            # A single level is addressed by its plain value, several by a tuple.
            key = key_parts[0] if len(key_parts) == 1 else tuple(key_parts)
            result_df[strat_value] = pivot[key]

    return result_df




[docs]
def extract_population_data_by_stratification_for_dir(dir_or_filename: str,
                                                      node_id: int = None,
                                                      gender: str = None,
                                                      age_bin_list: list[float] = None,
                                                      start_column_name: str = None,
                                                      strat_values: list[str] = None,
                                                      show_avg_per_run: bool = False):
    """
    Extract the population per stratification value from every report file found
    for 'dir_or_filename' and combine the results into one dataframe.

    Args:
        dir_or_filename (str, required):
            The directory or filename handed to helpers.get_filenames() to find the
            ReportHIVByAgeAndGender CSV files.

        node_id, gender, age_bin_list, start_column_name, strat_values:
            Passed unchanged to extract_population_data_by_stratification() for each file.

        show_avg_per_run (bool, optional):
            If True, the columns of all files are added together and divided by the
            number of files.  If False, each file keeps its own columns, named
            '<filename>-<column>'.

    Returns:
        A tuple of (combined dataframe, list of column name prefixes).  The prefix
        list is [""] when averaging, otherwise one '<filename>-' entry per file.
    """
    report_filenames = helpers.get_filenames(dir_or_filename=dir_or_filename,
                                             file_prefix="ReportHIVByAgeAndGender",
                                             file_extension=".csv")

    def _output_name(report_fn, source_name):
        """Name of the combined column that receives 'source_name' of 'report_fn'."""
        if show_avg_per_run:
            return source_name
        return report_fn + "-" + source_name

    result_df = pd.DataFrame()
    for report_fn in report_filenames:
        print(f"Extracting data from {report_fn}")
        file_df = extract_population_data_by_stratification(filename=report_fn,
                                                            node_id=node_id,
                                                            gender=gender,
                                                            age_bin_list=age_bin_list,
                                                            start_column_name=start_column_name,
                                                            strat_values=strat_values)

        # While the result is still empty, adopt this file's index and create
        # zero-filled columns so the running sums have something to add to.
        if len(result_df.columns) == 0:
            result_df.index = file_df.index
            for source_name in file_df.columns:
                result_df[_output_name(report_fn, source_name)] = 0

        for source_name in file_df.columns:
            target = _output_name(report_fn, source_name)
            if show_avg_per_run:
                result_df[target] = result_df[target] + file_df[source_name]
            else:
                result_df[target] = file_df[source_name]

    if show_avg_per_run:
        for target in result_df.columns:
            result_df[target] = result_df[target] / len(report_filenames)
        prefixes = [""]
    else:
        prefixes = [report_fn + "-" for report_fn in report_filenames]

    return result_df, prefixes




[docs]
def create_df_for_plot_by_stratification(combined_df: pd.DataFrame,
                                         col_name_prefixs: list[str],
                                         age_bin_list: list[float] = None,
                                         show_fraction: bool = False):
    """
    Optionally convert the per-stratification counts in 'combined_df' into fractions.

    When 'show_fraction' is True, the columns are grouped by column name prefix
    (and by age bin label when 'age_bin_list' is given) and every column is divided
    by the sum of the columns in its group.  The dataframe is modified in place.

    Args:
        combined_df (pd.DataFrame, required):
            The data to convert.  Column names are expected to look like
            '<prefix><stratification value>[:<age min> - <age max>]'.

        col_name_prefixs (list[str], required):
            The column name prefixes that separate the groups.  A column belongs
            to a prefix when its name starts with it; "" matches every column.

        age_bin_list (list[float], optional):
            Boundaries of the age bins used to create the columns.  Consecutive
            values produce the labels ':<min> - <max>' that columns must end with.

        show_fraction (bool, optional):
            If False, the dataframe is returned unchanged.

    Returns:
        The same 'combined_df' object that was passed in.
    """
    if not show_fraction:
        return combined_df

    # The empty suffix matches every column, so all columns of a prefix are first
    # normalised by their overall total; the age bin passes that follow then
    # re-normalise the columns within each age bin.
    age_suffixes = [""]
    if age_bin_list:
        age_suffixes.extend(f":{age_min!s} - {age_max!s}"
                            for age_min, age_max in zip(age_bin_list[:-1], age_bin_list[1:]))

    for prefix in col_name_prefixs:
        for age_suffix in age_suffixes:
            # Anchor the match at both ends of the name: a plain substring test would
            # let the prefix "1/x.csv-" also capture the columns of "11/x.csv-".
            group = [name for name in combined_df.columns
                     if name.startswith(prefix) and name.endswith(age_suffix)]

            # Explicit addition (instead of DataFrame.sum) keeps missing values
            # propagating into the total.
            total = pd.Series(0, index=combined_df.index)
            for name in group:
                total = total + combined_df[name]
            for name in group:
                combined_df[name] = combined_df[name] / total

    return combined_df




[docs]
def plot_population_for_dir(dir_or_filename: str,
                            unworld_pop_filename: str,
                            country: str,
                            version: str,
                            x_base_population: float = 1.0,
                            show_avg_per_run: bool = False,
                            gender: str = None,
                            age_bin: float = None,
                            img_dir: str = None):
    """
    Plot the population for the given age bin against the data in the UN World Population spreadsheet.

    Args:
        dir_or_filename (str, required):
            The directory or filename containing the ReportHIVByAgeAndGender.csv files.

        unworld_pop_filename (str, required):
            The name and path to a UN World Pop Excel spreadsheet where the 'country' parameter specifies
            a country name found in the spreadsheet and the 'version' specifies the year of the data.
            These values are needed to know how to read the data in the spreadsheet.
            If this is empty or None, no expected population is plotted.

        country (str, required):
            The name of the country found in the spreadsheet to extract the data for.

        version (str, required):
            The year associated with when the UN World Pop file was created.
            PLEASE NOTE: This year is a string.

        x_base_population (float, optional):
            The 'x_Base_Population' value (found in the config).  The population numbers
            in the CSV file are divided by this value so you get numbers that match the
            true population.  Must be greater than zero.

        show_avg_per_run (bool, optional):
            If 'dir_or_filename' is a directory, this will plot the average population
            at each time step over the files instead of one line per file.
            Default is False.

        gender (str, optional):
            The string (Male or Female) for the gender that data is being filtered for.

        age_bin (float, optional):
            If provided, the data for this specific age stratification will be plotted.
            Both the data in the report file and the UN World Pop file must have this
            stratification.  If you do not provide a value, then the population is not
            broken up by age (i.e. total population).

        img_dir (str, optional):
            Directory to save the images. If None, the images will not be saved and a window will be opened.

    Returns:
        None - but image will be saved or window opened.

    Raises:
        ValueError: if 'x_base_population' is not greater than zero.
    """
    if x_base_population <= 0.0:
        raise ValueError("'x_base_population' must be a value greater than zero.")

    # Get the file or files in the directory
    dir_filenames = helpers.get_filenames(dir_or_filename=dir_or_filename,
                                          file_prefix="ReportHIVByAgeAndGender",
                                          file_extension=".csv")

    # ---------------------------------------------------------------
    # Extract the data from each file and combine into one dataframe.
    # When averaging, all files are summed into one column; otherwise
    # each file gets its own column named after the file.
    # ---------------------------------------------------------------
    combined_df = pd.DataFrame()
    for fn in dir_filenames:
        df = extract_population_data(fn, gender=gender, age_bin=age_bin)
        if len(combined_df.columns) == 0:
            combined_df.index = df.index
            if show_avg_per_run:
                combined_df[COL_NAME_POP] = 0
        if show_avg_per_run:
            combined_df[COL_NAME_POP] = combined_df[COL_NAME_POP] + df[COL_NAME_POP] / x_base_population
        else:
            combined_df[fn] = df[COL_NAME_POP] / x_base_population

    if show_avg_per_run:
        for column_name in combined_df.columns:
            combined_df[column_name] = combined_df[column_name] / len(dir_filenames)

    # ----------------------------------------------------------------
    # Extract the expected population data from the UN World Pop file
    # and put into dataframe.  Get the data for the years that you have
    # in the CSV file.
    # ----------------------------------------------------------------
    years = combined_df.index.values.astype(int).tolist()
    years = list(set(years))

    # The 2015 version of the spreadsheet names its date column differently.
    date_column = COL_NAME_YEAR
    if version == "2015":
        date_column = "Reference date (as of 1 July)"

    unwp_df2 = None
    title2 = dir_or_filename
    if unworld_pop_filename:
        title2 = str(unworld_pop_filename)
        unwp_df = unwp.extract_population_by_age(country=country,
                                                 version=version,
                                                 years=years,
                                                 filename=unworld_pop_filename)
        unwp_df.index = unwp_df[date_column].astype(float)
        del unwp_df[date_column]

        # Convert the remaining column names to ints so they can be looked up by 'age_bin'.
        rename_dict = {}
        for name in unwp_df.columns:
            rename_dict[name] = int(name)
        unwp_df = unwp_df.rename(columns=rename_dict)

        unwp_df2 = pd.DataFrame()
        unwp_df2.index = unwp_df.index.astype(float)
        if age_bin is not None:
            unwp_df2["Expected Population"] = unwp_df[age_bin]
        else:
            unwp_df2["Expected Population"] = unwp_df.sum(axis=1)

    # --------------------------
    # create title and plot data
    # --------------------------
    title = ""
    if show_avg_per_run:
        # The trailing space keeps the two parts of the title apart.
        title = title + "Average Per Run "
    title = title + "Population"
    if gender is not None:
        title = title + " - " + gender
    if age_bin is not None:
        if age_bin == 80:
            title = title + " - " + str(age_bin) + "+"
        else:
            title = title + " - " + str(age_bin) + "-" + str(age_bin + 5)

    if not TEST_include_dir_or_filename:
        title2 = None

    xy_plot.xy_plot(img_dir=img_dir,
                    df=combined_df,
                    expected_df=unwp_df2,
                    title_1=title,
                    title_2=title2,
                    y_axis_name="Number of People",
                    fraction_of_total=False,
                    show_legend=show_avg_per_run,
                    show_markers=show_avg_per_run,
                    min_x=None, max_x=None, min_y=None, max_y=None,
                    x_axis_as_log_scale=False,
                    y_axis_as_log_scale=False)




[docs]
def plot_population_by_gender(filename: str,
                              img_dir: str = None):
    """
    Plot the population of each gender over time for a single report file.

    The population is summed per year and gender value; gender value 0 is plotted
    as "Number of Men" and gender value 1 as "Number of Women".

    Args:
        filename (str, required):
            The name and path of the ReportHIVByAgeAndGender.csv file to extract the data from.

        img_dir (str, optional):
            Directory to save the images. If None, the images will not be saved and a window will be opened.

    Returns:
        None - but image will be saved or window opened.

    Raises:
        ValueError: if 'filename' is not an existing file or the file has no gender column.
    """
    if not os.path.isfile(filename):
        raise ValueError(f"The filename, '{filename}' given does not appear to be a file.")

    report_df = pd.read_csv(filename)
    if COL_NAME_GENDER not in report_df.columns:
        raise ValueError(f"'{COL_NAME_GENDER}' column does not exist in the file({filename}).")

    population_by_gender = report_df.pivot_table(index=COL_NAME_YEAR,
                                                 columns=[COL_NAME_GENDER],
                                                 values=COL_NAME_POP,
                                                 aggfunc="sum")

    # One plot line per gender value found in the pivot table.
    plot_df = pd.DataFrame()
    plot_df.index = population_by_gender.index
    for line_label, gender_value in (("Number of Men", 0), ("Number of Women", 1)):
        plot_df[line_label] = population_by_gender[gender_value]

    plot_options = dict(img_dir=img_dir,
                        df=plot_df,
                        title_1="Population by Gender",
                        title_2=None,
                        y_axis_name="Number of People",
                        fraction_of_total=False,
                        min_x=None, max_x=None, min_y=None, max_y=None,
                        x_axis_as_log_scale=False,
                        y_axis_as_log_scale=False)
    xy_plot.xy_plot(**plot_options)




[docs]
def plot_population_by_ip(dir_or_filename: str,
                          exp_dir_or_filename: str = None,
                          node_id: int = None,
                          gender: str = None,
                          age_bin_list=None,
                          ip_key: str = None,
                          ip_values: list[str] = None,
                          show_avg_per_run: bool = False,
                          show_fraction: bool = False,
                          expected_values: dict = None,
                          img_dir: str = None):
    """
    For the indicated files, create a plot showing who has what value of the given IP key over time.

    Args:
        dir_or_filename (str, required):
            The directory or filename containing the ReportHIVByAgeAndGender.csv files.

        exp_dir_or_filename (str, optional):
            The expected or alternate directory or filename containing the ReportHIVByAgeAndGender.csv files.
            Cannot be combined with 'expected_values'.

        node_id (int, optional):
            The ID of the node for which the data is being filtered for.

        gender (str, optional):
            The string (Male or Female) for the gender that data is being filtered for.

        age_bin_list (list[float], optional):
            A list of ages, in years, where the population with risk value will be counted
            for each bin.  For example, if you enter [10, 25, 30, 55], there will be three
            age bins with the following ranges: [10-25), [25-30), [30-55)

        ip_key (str, required):
            Extract the data from the files based on this IP stratification column.
            If the files has not been stratified by this IP, you will need to re-run the
            simulations with stratification turned on.

        ip_values (list[str], optional):
            By default, this plotting tool uses all of the values for the IP, however, this allows
            you to only include the ones you are interested (i.e. a subset to the total possible
            values for the IP)

        show_avg_per_run (bool, optional):
            If 'dir_or_filename' is a directory, this will calculate the average number of
            people with the given risk type at a given time step between the files.
            Default is False.

        show_fraction (bool, optional):
            True indicates that the number of people the given IP value will be divided by
            the sum of all the people with all of the selected IP values.  For example, if the
            IP key were Risk and you selected to only plot LOW and MEDIUM, then the total number
            of people with LOW will be divided by the total number of people with either LOW or MEDIUM.

        expected_values (dict, optional):
            If the user provides this dictionary, the constant expected value of each IP will be
            plotted.  This should be a dictionary with an IP value as the key and a constant expected value.
            There should be an IP value for each IP value plotted.  When 'ip_values' is None, one
            expected line is drawn for every key of this dictionary.  Cannot be combined with
            'exp_dir_or_filename'.

        img_dir (str, optional):
            Directory to save the images. If None, the images will not be saved and a window will be opened.

    Returns:
        None - but image will be saved or window opened.

    Raises:
        ValueError: if both 'exp_dir_or_filename' and 'expected_values' are given, or 'ip_key' is None.
        KeyError: if 'ip_values' names a value that is not a key of 'expected_values'.
    """

    if (exp_dir_or_filename is not None) and (expected_values is not None):
        raise ValueError("You cannot specify both 'exp_dir_or_filename' and 'expected_values'. "
                         + "Please only specify one of these.")

    if ip_key is None:
        raise ValueError("IP Key must be specified.")
    ip_col_name = " IP_Key:" + ip_key

    def _load_plot_df(source: str):
        # Extract the stratified data for one directory/file and shape it for plotting.
        raw_df, prefixes = extract_population_data_by_stratification_for_dir(dir_or_filename=source,
                                                                             node_id=node_id,
                                                                             gender=gender,
                                                                             age_bin_list=age_bin_list,
                                                                             start_column_name=ip_col_name,
                                                                             strat_values=ip_values,
                                                                             show_avg_per_run=show_avg_per_run)
        return create_df_for_plot_by_stratification(combined_df=raw_df,
                                                    col_name_prefixs=prefixes,
                                                    age_bin_list=age_bin_list,
                                                    show_fraction=show_fraction)

    combined_df = _load_plot_df(dir_or_filename)

    expected_df = None
    if expected_values:
        # 'ip_values' is optional; without it, every key of 'expected_values' gets a line
        # instead of failing while iterating over None.
        expected_ip_values = ip_values if ip_values is not None else list(expected_values)
        expected_df = pd.DataFrame()
        expected_df.index = combined_df.index
        for value in expected_ip_values:
            expected_df["Expected: " + value] = expected_values[value]
    elif exp_dir_or_filename is not None:
        expected_df = _load_plot_df(exp_dir_or_filename)

    base_title = "with " + ip_key + "=X"
    title = create_title(base_title=base_title,
                         node_id=node_id,
                         gender=gender,
                         show_avg_per_run=show_avg_per_run,
                         show_fraction=show_fraction,
                         show_fraction_of=False,
                         fraction_of_str=None,
                         has_age_bins=(age_bin_list is not None))
    y_axis_name = create_y_axis_name(base_title=" " + base_title,
                                     node_id=node_id,
                                     gender=gender,
                                     show_avg_per_run=show_avg_per_run,
                                     show_fraction=show_fraction,
                                     show_fraction_of=False,
                                     fraction_of_str=None,
                                     has_age_bins=(age_bin_list is not None))

    # Optional second title line naming the data source(s).
    title2 = None
    if TEST_include_dir_or_filename:
        if exp_dir_or_filename is not None:
            title2 = "color= " + dir_or_filename
            title2 = title2 + "\nblack = " + exp_dir_or_filename
        else:
            title2 = dir_or_filename

    xy_plot.xy_plot(img_dir=img_dir,
                    df=combined_df,
                    expected_df=expected_df,
                    title_1=title,
                    title_2=title2,
                    y_axis_name=y_axis_name,
                    show_legend=show_avg_per_run,
                    fraction_of_total=False,
                    min_x=None, max_x=None, min_y=None, max_y=None,
                    x_axis_as_log_scale=False,
                    y_axis_as_log_scale=False)




[docs]
def plot_columns(filename: str,
                 title: str,
                 y_axis_name: str,
                 column_names: list[str],
                 fraction_of_population: bool = False,
                 img_dir: str = None):
    """
    For a given file, plot the indicated columns versus time (i.e. Year).

    Each column is summed per year.  The caller's 'column_names' list is never modified.

    Args:
        filename (str, required):
            The name and path of the ReportHIVByAgeAndGender.csv file to extract the data from.

        title (str, required):
            The title to put at the top of the plot.

        y_axis_name (str, required):
            The name to label the y-axis on the plot.

        column_names (list[str], required):
            The list of column names to plot the data for.  The report has a space before each
            column name.  Please be sure to include it.

        fraction_of_population (bool, optional):
            If True, divide the yearly sum of each column by the yearly sum of the population
            column.  The population column itself is not plotted in this mode.

        img_dir (str, optional):
            Directory to save the images. If None, the images will not be saved and a window will be opened.

    Returns:
        None - but image will be saved or window opened.

    Raises:
        ValueError: if 'filename' is not an existing file, a requested column is missing, or
            'fraction_of_population' is True and the file has no population column.
    """
    if not os.path.isfile(filename):
        raise ValueError(f"The filename, '{filename}' given does not appear to be a file.")

    df = pd.read_csv(filename)

    for name in column_names:
        if name not in df.columns:
            raise ValueError(f"'{name}' column does not exist in the file({filename}).")

    # Work on a private copy so the caller's list is left untouched, and only add the
    # population column when it is not already requested (a duplicate would break the pivot).
    value_columns = list(column_names)
    if fraction_of_population:
        if COL_NAME_POP not in df.columns:
            raise ValueError(f"'{COL_NAME_POP}' column does not exist in the file({filename}).")
        if COL_NAME_POP not in value_columns:
            value_columns.append(COL_NAME_POP)

    pv = df.pivot_table(index=COL_NAME_YEAR,
                        columns=[],
                        values=value_columns,
                        aggfunc="sum")

    df2 = pd.DataFrame()
    df2.index = pv.index
    for value in value_columns:
        if not fraction_of_population:
            df2[value] = pv[value]
        elif value != COL_NAME_POP:
            # The population column is only the divisor; it is not plotted as a fraction of itself.
            df2[value] = pv[value] / pv[COL_NAME_POP]

    xy_plot.xy_plot(img_dir=img_dir,
                    df=df2,
                    title_1=title,
                    title_2=None,
                    y_axis_name=y_axis_name,
                    fraction_of_total=False,
                    min_x=None, max_x=None, min_y=None, max_y=None,
                    x_axis_as_log_scale=False,
                    y_axis_as_log_scale=False)




[docs]
def plot_circumcision_by_age(filename: str,
                             age_bin_list: list[float],
                             fraction_of_total: bool = False,
                             img_dir: str = None):
    """
    For a single file, plot the number of men who are circumcised by age.

    Args:
        filename (str, required):
            The name and path of the ReportHIVByAgeAndGender.csv to extract the data from.

        age_bin_list (list[float], required):
            A list of ages in years where the circumcised men will be counted
            for each bin.  For example, if you enter [10, 25, 30, 55], there will be three
            age bins with the following ranges: [10-25), [25-30), [30-55)

        fraction_of_total (bool, optional):
            If True, the number of men who are circumcised will be divided by the total number
            of men (circumcised or not) in the same age bin.

        img_dir (str, optional):
            Directory to save the images. If None, the images will not be saved and a window will be opened.

    Returns:
        None - but image will be saved or window opened.

    Raises:
        ValueError: if 'filename' is not an existing file, or the file is missing the gender,
            age, or circumcision column.
    """
    if not os.path.isfile(filename):
        raise ValueError(f"The filename, '{filename}' given does not appear to be a file.")

    df = pd.read_csv(filename)

    # All three stratification columns are needed by the pivot below.
    for required_column in (COL_NAME_GENDER, COL_NAME_AGE, COL_NAME_IS_CIRC):
        if required_column not in df.columns:
            raise ValueError(f"'{required_column}' column does not exist in the file({filename}).")

    ages = df[COL_NAME_AGE].unique()

    y_axis_name = "Number of Circumcised Men"
    if fraction_of_total:
        y_axis_name = "Fraction of Men Circumcised by Age"

    # Columns are (gender, age, is_circumcised) tuples; each cell is the summed population for a year.
    pv = df.pivot_table(index=COL_NAME_YEAR,
                        columns=[COL_NAME_GENDER, COL_NAME_AGE, COL_NAME_IS_CIRC],
                        values=COL_NAME_POP,
                        aggfunc="sum")

    df2 = pd.DataFrame()
    df2.index = pv.index

    gender_id = 0        # this plot only uses rows with gender code 0 (men)
    not_circumcised = 0
    is_circumcised = 1
    for index in range(len(age_bin_list) - 1):
        bin_min = age_bin_list[index]
        bin_max = age_bin_list[index + 1]
        label = f"Men Circumcised: {bin_min}-{bin_max}"

        # Per-year running sums for this bin, kept in local Series rather than written into the pivot table.
        circumcised_in_bin = pd.Series(0, index=pv.index)
        men_in_bin = pd.Series(0, index=pv.index)
        for age in ages:
            if (bin_min <= age) and (age < bin_max):
                men_in_bin = men_in_bin + pv[(gender_id, age, not_circumcised)] + pv[(gender_id, age, is_circumcised)]
                circumcised_in_bin = circumcised_in_bin + pv[(gender_id, age, is_circumcised)]

        df2[label] = circumcised_in_bin
        if fraction_of_total:
            df2[label] = df2[label] / men_in_bin

    xy_plot.xy_plot(img_dir=img_dir,
                    df=df2,
                    title_1="Circumcised Men by Age",
                    title_2=None,
                    y_axis_name=y_axis_name,
                    fraction_of_total=False,
                    min_x=None, max_x=None, min_y=None, max_y=None,
                    x_axis_as_log_scale=False,
                    y_axis_as_log_scale=False)




[docs]
def extract_population_data_multiple_ages_for_dir(dir_or_filename: str,
                                                  node_id: int = None,
                                                  gender: str = None,
                                                  age_bin_list: list[float] = None,
                                                  show_avg_per_run: bool = False,
                                                  filter_by_hiv_negative: bool = False,
                                                  other_strat_column_name: str = None,
                                                  other_strat_value_a: Union[int, float, str] = None,
                                                  other_strat_value_b: Union[int, float, str] = None,
                                                  other_data_column_names: list[str] = None):
    """
    Extract the data from every ReportHIVByAgeAndGender*.csv found for 'dir_or_filename' and
    combine it into a single dataframe.

    Each file is read with extract_population_data_multiple_ages() using 'other_strat_value_a'.
    If both 'other_strat_column_name' and 'other_strat_value_b' are given, the file is read a
    second time with 'other_strat_value_b' and the value stored for each column is a / (a + b).

    When 'show_avg_per_run' is True, the per-file values are summed under the original column
    names and divided by the number of files.  Otherwise every file gets its own set of
    columns named '<filename>-<column name>'.

    Returns:
        A tuple of (combined dataframe, list of column name prefixes, HIV-negative flag).
        The prefix list is [""] when averaging, else one '<filename>-' entry per file.
        The flag is the one reported by extract_population_data_multiple_ages() for the
        last file processed (False if there were no files).

    Raises:
        ValueError: if the two extractions of one file report different HIV-negative flags.
    """
    report_paths = helpers.get_filenames(dir_or_filename=dir_or_filename,
                                         file_prefix="ReportHIVByAgeAndGender",
                                         file_extension=".csv")

    # Arguments that are identical for every extraction call.
    shared_kwargs = dict(node_id=node_id,
                         gender=gender,
                         age_bin_list=age_bin_list,
                         filter_by_hiv_negative=filter_by_hiv_negative,
                         other_strat_column_name=other_strat_column_name,
                         other_data_column_names=other_data_column_names)
    compare_two_values = (other_strat_column_name is not None) and (other_strat_value_b is not None)

    result_df = pd.DataFrame()
    hiv_negative_flag = False
    for path in report_paths:
        print("extracting data from " + path)
        first_df, first_flag = extract_population_data_multiple_ages(path,
                                                                     other_strat_value=other_strat_value_a,
                                                                     **shared_kwargs)
        second_df, second_flag = None, None
        if compare_two_values:
            second_df, second_flag = extract_population_data_multiple_ages(path,
                                                                           other_strat_value=other_strat_value_b,
                                                                           **shared_kwargs)

        if (second_flag is not None) and (first_flag != second_flag):
            raise ValueError("The two dataframes for the two stratifications do not have the same HIV negative status. "
                             + "This is likely because you are trying to extract data for a stratification that does not exist in the file.")
        hiv_negative_flag = first_flag

        # The first file fixes the index and seeds the columns with zeros.
        if len(result_df.columns) == 0:
            result_df.index = first_df.index
            for source_column in first_df.columns:
                seeded_name = source_column if show_avg_per_run else path + "-" + source_column
                result_df[seeded_name] = 0

        for source_column in first_df.columns:
            values = first_df[source_column]
            if second_df is not None:
                values = values / (values + second_df[source_column])

            if show_avg_per_run:
                # Accumulate now; the sum is divided by the file count after the loop.
                result_df[source_column] = result_df[source_column] + values
            else:
                result_df[path + "-" + source_column] = values

    if show_avg_per_run:
        file_count = len(report_paths)
        for summed_column in result_df.columns:
            result_df[summed_column] = result_df[summed_column] / file_count
        prefixes = [""]
    else:
        prefixes = [path + "-" for path in report_paths]

    return result_df, prefixes, hiv_negative_flag




[docs]
def create_df_for_plot_by_age(combined_df: pd.DataFrame,
                              col_name_prefixs: list[str],
                              main_column_name: str,
                              gender: str,
                              age_bins: list[float],
                              show_fraction: bool,
                              fraction_of: bool,
                              fraction_of_column_name: str):
    """
    Build the dataframe that is handed to the plotting routine from the combined, extracted data.

    The input dataframe is never modified; a new dataframe is always returned.

    Args:
        combined_df (pd.DataFrame, required):
            The extracted data.  Column names are expected to be '<prefix><column name>' or,
            when age bins are used, '<prefix><column name>:<min> - <max>'.

        col_name_prefixs (list[str], required):
            The prefixes used in the column names of 'combined_df'.  Only used when
            'show_fraction' is True.

        main_column_name (str, required):
            The name of the data column being plotted (the numerator when showing fractions).

        gender (str, optional):
            If given and 'show_fraction' is False, it is prepended to each column label.

        age_bins (list[float], optional):
            The age bin boundaries used when the data was extracted.  Only used when
            'show_fraction' is True.

        show_fraction (bool, required):
            If True, return main column / divisor (NaN replaced by 0) for every prefix and age bin.
            If False, return the columns whose name contains 'main_column_name'.

        fraction_of (bool, required):
            If True, the divisor is 'fraction_of_column_name', otherwise it is the population column.

        fraction_of_column_name (str, optional):
            The divisor column name; only needed when both 'show_fraction' and 'fraction_of' are True.
            Its first character is dropped when building the column label.

    Returns:
        A new pd.DataFrame with the same index as 'combined_df'.
    """
    if not show_fraction:
        # Select into a copy instead of deleting columns from the caller's dataframe.
        kept_columns = [name for name in combined_df.columns if main_column_name in name]
        df2 = combined_df[kept_columns].copy()
        if gender:
            df2 = df2.rename(columns={name: gender + name for name in kept_columns})
        return df2

    df2 = pd.DataFrame(index=combined_df.index)

    # Only the divisor actually in use is referenced, so 'fraction_of_column_name' may be
    # None when dividing by the population.
    divisor_column_name = fraction_of_column_name if fraction_of else COL_NAME_POP
    for prefix in col_name_prefixs:
        if fraction_of:
            fraction_label = prefix + main_column_name + "/" + fraction_of_column_name[1:]
        else:
            fraction_label = prefix + main_column_name + "/Population"

        if age_bins:
            for age_min, age_max in zip(age_bins[:-1], age_bins[1:]):
                age_suffix = ":" + str(age_min) + " - " + str(age_max)
                numerator = combined_df[prefix + main_column_name + age_suffix]
                divisor = combined_df[prefix + divisor_column_name + age_suffix]
                # 0/0 produces NaN; report it as a fraction of 0.
                df2[fraction_label + age_suffix] = (numerator / divisor).fillna(0)
        else:
            numerator = combined_df[prefix + main_column_name]
            divisor = combined_df[prefix + divisor_column_name]
            df2[fraction_label] = (numerator / divisor).fillna(0)

    return df2




[docs]
def base_plot_by_age(base_title: str,
                     main_column_name: str,
                     dir_or_filename: str,
                     exp_dir_or_filename: str = None,
                     node_id: int = None,
                     age_bins: list[float] = None,
                     gender: str = None,
                     filter_by_hiv_negative: bool = False,
                     other_strat_column_name: str = None,
                     other_strat_value_a: Union[int, float, str] = None,
                     other_strat_value_b: Union[int, float, str] = None,
                     show_avg_per_run: bool = False,
                     show_fraction: bool = False,
                     fraction_of: bool = False,
                     fraction_of_column_name: str = None,
                     fraction_of_str: str = None,
                     img_dir: str = None):
    """
    Shared implementation for the "by age" plots: extract one data column from the report
    files, optionally turn it into a fraction, and plot it over time.

    Args:
        base_title (str, required):
            The text describing the data; used to build the plot title and y-axis name.

        main_column_name (str, required):
            The name of the report column to plot.

        dir_or_filename (str, required):
            The directory or filename containing the ReportHIVByAgeAndGender.csv files.

        exp_dir_or_filename (str, optional):
            The expected or alternate directory or filename containing the
            ReportHIVByAgeAndGender.csv files.  If given, its data is passed to the plot
            as the expected data.

        node_id (int, optional):
            The ID of the node for which the data is being filtered for.

        age_bins (list[float], optional):
            A list of ages in years defining the age bins to count in.

        gender (str, optional):
            The string (Male or Female) for the gender that data is being filtered for.

        filter_by_hiv_negative (bool, optional):
            Passed through to the data extraction.

        other_strat_column_name (str, optional):
            An additional stratification column passed through to the data extraction.

        other_strat_value_a (int | float | str, optional):
            The value of 'other_strat_column_name' to extract the data for.

        other_strat_value_b (int | float | str, optional):
            If given together with 'other_strat_column_name', the extracted values are
            a / (a + b) where a and b are the data for the two stratification values.

        show_avg_per_run (bool, optional):
            If True, the data is averaged over the files; also controls showing the legend.

        show_fraction (bool, optional):
            If True, plot the main column divided by either the population or the
            'fraction_of_column_name' column, depending on 'fraction_of'.

        fraction_of (bool, optional):
            If True (and 'show_fraction' is True), divide by 'fraction_of_column_name'
            instead of the population.

        fraction_of_column_name (str, optional):
            The name of the divisor column.  Required when both 'show_fraction' and
            'fraction_of' are True.

        fraction_of_str (str, optional):
            Text describing the divisor; passed to the title and y-axis name builders.

        img_dir (str, optional):
            Directory to save the images. If None, the images will not be saved and a window will be opened.

    Returns:
        None - but image will be saved or window opened.

    Raises:
        ValueError: if the divisor column is needed but not given, or if the data in
            'dir_or_filename' and 'exp_dir_or_filename' disagree on the HIV-negative flag.
    """
    # Fail early with a clear message; otherwise the missing name surfaces later as a TypeError.
    if show_fraction and fraction_of and (fraction_of_column_name is None):
        raise ValueError("'fraction_of_column_name' must be specified when both "
                         + "'show_fraction' and 'fraction_of' are True.")

    other_data_column_names = []
    if main_column_name != COL_NAME_POP:
        other_data_column_names.append(main_column_name)
    if show_fraction and fraction_of:
        other_data_column_names.append(fraction_of_column_name)

    def _load_plot_df(source: str):
        # Extract the data for one directory/file and shape it for plotting.
        raw_df, prefixes, source_hiv_neg = extract_population_data_multiple_ages_for_dir(dir_or_filename=source,
                                                                                         node_id=node_id,
                                                                                         gender=gender,
                                                                                         age_bin_list=age_bins,
                                                                                         show_avg_per_run=show_avg_per_run,
                                                                                         filter_by_hiv_negative=filter_by_hiv_negative,
                                                                                         other_strat_column_name=other_strat_column_name,
                                                                                         other_strat_value_a=other_strat_value_a,
                                                                                         other_strat_value_b=other_strat_value_b,
                                                                                         other_data_column_names=other_data_column_names)
        plot_df = create_df_for_plot_by_age(combined_df=raw_df,
                                            col_name_prefixs=prefixes,
                                            main_column_name=main_column_name,
                                            gender=gender,
                                            age_bins=age_bins,
                                            show_fraction=show_fraction,
                                            fraction_of=fraction_of,
                                            fraction_of_column_name=fraction_of_column_name)
        return plot_df, source_hiv_neg

    df2, hiv_neg = _load_plot_df(dir_or_filename)

    exp_df2 = None
    exp_hiv_neg = None
    if exp_dir_or_filename:
        exp_df2, exp_hiv_neg = _load_plot_df(exp_dir_or_filename)

    if exp_hiv_neg is not None and hiv_neg != exp_hiv_neg:
        # f-strings so the message shows the actual paths and flags.
        raise ValueError("The two sets of data do not each have the HasHIV column.\n"
                         + f"The files in {dir_or_filename} have HasHIV={hiv_neg}.\n"
                         + f"The files in {exp_dir_or_filename} have HasHIV={exp_hiv_neg}.\n"
                         + "Please check the files and make sure they are correct.")

    title = create_title(base_title=base_title,
                         node_id=node_id,
                         gender=gender,
                         show_avg_per_run=show_avg_per_run,
                         show_fraction=show_fraction,
                         show_fraction_of=fraction_of,
                         fraction_of_str=fraction_of_str,
                         hiv_negative=hiv_neg,
                         has_age_bins=(age_bins is not None))
    y_axis_name = create_y_axis_name(base_title=" " + base_title,
                                     node_id=node_id,
                                     gender=gender,
                                     show_avg_per_run=show_avg_per_run,
                                     show_fraction=show_fraction,
                                     show_fraction_of=fraction_of,
                                     fraction_of_str=fraction_of_str,
                                     has_age_bins=(age_bins is not None))

    # Optional second title line naming the data source(s).
    title2 = None
    if TEST_include_dir_or_filename:
        if exp_df2 is not None:
            title2 = "color=" + dir_or_filename
            title2 = title2 + "\nblack=" + exp_dir_or_filename
        else:
            title2 = dir_or_filename

    xy_plot.xy_plot(img_dir=img_dir,
                    df=df2,
                    expected_df=exp_df2,
                    title_1=title,
                    title_2=title2,
                    y_axis_name=y_axis_name,
                    show_legend=show_avg_per_run,
                    fraction_of_total=False,
                    min_x=None, max_x=None, min_y=None, max_y=None,
                    x_axis_as_log_scale=False,
                    y_axis_as_log_scale=False)




[docs]
def plot_onART_by_age(dir_or_filename: str,
                      exp_dir_or_filename: str = None,
                      node_id: int = None,
                      gender: str = None,
                      age_bin_list: list[float] = None,
                      show_avg_per_run: bool = False,
                      show_fraction: bool = False,
                      fraction_of_infected: bool = False,
                      img_dir: str = None):
    """
    Plot the people on ART over time, either as a count, as a fraction of the population,
    or as a fraction of the infected population.

    Args:
        dir_or_filename (str, required):
            The directory or filename containing the ReportHIVByAgeAndGender.csv files.

        exp_dir_or_filename (str, optional):
            The expected or alternate directory or filename containing the ReportHIVByAgeAndGender.csv files.

        node_id (int, optional):
            The ID of the node for which the data is being filtered for.

        gender (str, optional):
            The string (Male or Female) for the gender that data is being filtered for.

        age_bin_list (list[float], optional):
            A list of ages in years where the people on ART will be counted
            for each bin.  For example, if you enter [10, 25, 30, 55], there will be three
            age bins with the following ranges: [10-25), [25-30), [30-55)

        show_avg_per_run (bool, optional):
            If 'dir_or_filename' is a directory, this will calculate the average number of
            people on ART at a given time step between the files.
            Default is False.

        show_fraction (bool, optional):
            True indicates that the number of people on ART will be divided by either the number
            of people in the population or the number of infected people.  It depends on the
            'fraction_of_infected' parameter.

        fraction_of_infected (bool, optional):
            If 'show_fraction' is True, then this parameter determines what the divisor is when
            creating the fraction.  If it is True, it will divide the number of people on ART
            by the number of infected people.  If it is False, the divisor will be the total
            population with that stratification.

        img_dir (str, optional):
            Directory to save the images. If None, the images will not be saved and a window will be opened.

    Returns:
        None - but image will be saved or window opened.
    """
    # Settings that make the generic by-age plot an "On ART" plot.
    on_art_settings = {
        "base_title": "On ART",
        "main_column_name": COL_NAME_ON_ART,
        "fraction_of_column_name": COL_NAME_INFECTED,
        "fraction_of_str": "Infected ",
    }

    base_plot_by_age(dir_or_filename=dir_or_filename,
                     exp_dir_or_filename=exp_dir_or_filename,
                     node_id=node_id,
                     gender=gender,
                     age_bins=age_bin_list,
                     show_avg_per_run=show_avg_per_run,
                     show_fraction=show_fraction,
                     fraction_of=fraction_of_infected,
                     img_dir=img_dir,
                     **on_art_settings)




[docs]
def plot_population_by_age(dir_or_filename: str,
                           exp_dir_or_filename: str = None,
                           node_id: int = None,
                           gender: str = None,
                           age_bin_list: list[float] = None,
                           show_avg_per_run: bool = False,
                           img_dir: str = None):
    """
    Plot the population count over time, optionally split into age bins.

    This is a thin wrapper that asks base_plot_by_age() to plot the population
    column with every fraction-related option switched off.

    Args:
        dir_or_filename (str, required):
            The directory or filename containing the ReportHIVByAgeAndGender.csv files.

        exp_dir_or_filename (str, optional):
            The expected or alternate directory or filename containing the
            ReportHIVByAgeAndGender.csv files.  Default is None.

        node_id (int, optional):
            The ID of the node to filter the data for.

        gender (str, optional):
            The gender (Male or Female) to filter the data for.

        age_bin_list (list[float], optional):
            A list of ages in years that mark the edges of the age bins.  For example,
            [10, 25, 30, 55] gives three bins: [10-25), [25-30), [30-55).
            Use None (not an empty list) when no age bins are wanted.

        show_avg_per_run (bool, optional):
            If 'dir_or_filename' is a directory, plot the average between the files
            at each time step.  Default is False.

        img_dir (str, optional):
            Directory to save the images. If None, the images will not be saved and a window will be opened.

    Returns:
        None - but image will be saved or window opened.

    Raises:
        ValueError: If 'age_bin_list' is given but has no entries.
    """
    bins_given_but_empty = (age_bin_list is not None) and (len(age_bin_list) == 0)
    if bins_given_but_empty:
        raise ValueError(
            "The 'age_bin_list' parameter must be a list of ages in years where the population will be counted for each bin. "
            "If you do not want to use age bins, please set it to None."
        )

    # A plain head count: no title prefix and nothing to divide by.
    plot_options = dict(
        base_title="",
        main_column_name=COL_NAME_POP,
        dir_or_filename=dir_or_filename,
        exp_dir_or_filename=exp_dir_or_filename,
        node_id=node_id,
        gender=gender,
        age_bins=age_bin_list,
        show_avg_per_run=show_avg_per_run,
        show_fraction=False,
        fraction_of=False,
        fraction_of_column_name=None,
        fraction_of_str=None,
        img_dir=img_dir,
    )
    base_plot_by_age(**plot_options)




[docs]
def plot_vmmc_by_age(dir_or_filename: str,
                     exp_dir_or_filename: str = None,
                     node_id: int = None,
                     age_bin_list: list[float] = None,
                     show_avg_per_run: bool = False,
                     img_dir: str = None):
    """
    Plot information about the men who are circumcised, optionally split into age bins.

    This is a thin wrapper around base_plot_by_age().  The gender is always "Male",
    the population column is plotted, the data is additionally stratified on the
    IsCircumcised column (values 1 and 0), and 'filter_by_hiv_negative' is set to True.

    Args:
        dir_or_filename (str, required):
            The directory or filename containing the ReportHIVByAgeAndGender.csv files.

        exp_dir_or_filename (str, optional):
            The expected or alternate directory or filename containing the
            ReportHIVByAgeAndGender.csv files.  Default is None.

        node_id (int, optional):
            The ID of the node to filter the data for.

        age_bin_list (list[float], optional):
            A list of ages in years that mark the edges of the age bins.  For example,
            [10, 25, 30, 55] gives three bins: [10-25), [25-30), [30-55).

        show_avg_per_run (bool, optional):
            If 'dir_or_filename' is a directory, plot the average between the files
            at each time step.  Default is False.

        img_dir (str, optional):
            Directory to save the images. If None, the images will not be saved and a window will be opened.

    Returns:
        None - but image will be saved or window opened.
    """
    plot_options = dict(
        base_title="VMMC",
        main_column_name=COL_NAME_POP,
        dir_or_filename=dir_or_filename,
        exp_dir_or_filename=exp_dir_or_filename,
        node_id=node_id,
        gender="Male",  # circumcision only applies to men, so the caller cannot choose
        age_bins=age_bin_list,
        show_avg_per_run=show_avg_per_run,
        # Counts only - no fraction is calculated for this plot.
        show_fraction=False,
        fraction_of=False,
        fraction_of_column_name="",
        fraction_of_str=None,
        filter_by_hiv_negative=True,
        other_strat_column_name=COL_NAME_IS_CIRC,
        other_strat_value_a=1,  # Only show circumcised
        other_strat_value_b=0,  # Not circumcised
        img_dir=img_dir,
    )
    base_plot_by_age(**plot_options)




[docs]
def plot_population_by_age_vs_unworld_pop(filename: str,
                                          unworld_pop_filename: str,
                                          age_bin_list: list[float],
                                          country: str,
                                          version: str,
                                          x_base_population: float = 1.0,
                                          img_dir=None):
    """
    Compare the simulated population per age bin against the UN World Population data.

    Works on a single report file.  For every age bin one "actual" curve (from the
    report) and one "expected" curve (from the UN World Pop spreadsheet) is plotted,
    so three age bins produce six curves.

    Args:
        filename (str, required):
            The name and path of the ReportHIVByAgeAndGender.csv file to plot the data from.

        unworld_pop_filename (str, required):
            The name and path to a UN World Pop Excel spreadsheet.  The 'country' and
            'version' parameters tell the reader how to find the data in the spreadsheet.

        age_bin_list (list[float], required):
            A list of ages in years that mark the edges of the age bins.  For example,
            [10, 25, 30, 55] gives three bins: [10-25), [25-30), [30-55).
            The expected curve of a bin is read from the UN data column whose name is
            str() of the bin's lower edge.

        country (str, required):
            The name of the country found in the spreadsheet to extract the data for.

        version (str, required):
            The year associated with when the UN World Pop file was created.
            PLEASE NOTE: This year is a string.

        x_base_population (float, optional):
            Every binned population from the CSV file is divided by this value
            (the 'x_Base_Population' found in the config) so the numbers can be
            compared with the true population.  Default is 1.0.

        img_dir (str, optional):
            Directory to save the images. If None, the images will not be saved and a window will be opened.

    Returns:
        None - but image will be saved or window opened.

    Raises:
        ValueError: If 'filename' is not an existing file, or the file lacks the
            gender, age, or population column.
    """
    if not os.path.isfile(filename):
        raise ValueError(f"The filename, '{filename}' given does not appear to be a file.")

    report_df = pd.read_csv(filename)

    for required_column in (COL_NAME_GENDER, COL_NAME_AGE, COL_NAME_POP):
        if required_column not in report_df.columns:
            raise ValueError(f"'{required_column}' column does not exist in the file({filename}).")

    report_ages = report_df[COL_NAME_AGE].unique()

    # One row per year, one column per age found in the report.
    pop_by_year_and_age = report_df.pivot_table(index=COL_NAME_YEAR,
                                                columns=[COL_NAME_AGE],
                                                values=COL_NAME_POP,
                                                aggfunc="sum")

    # Consecutive pairs of edges: (lower inclusive, upper exclusive).
    bin_edges = [(age_bin_list[pos], age_bin_list[pos + 1])
                 for pos in range(len(age_bin_list) - 1)]

    actual_df = pd.DataFrame()
    actual_df.index = pop_by_year_and_age.index

    for lower_age, upper_age in bin_edges:
        actual_label = f"Population: {lower_age}-{upper_age}"
        ages_in_bin = [age for age in report_ages if lower_age <= age < upper_age]

        # Accumulate the per-age columns of this bin, starting from zero.
        pop_by_year_and_age[actual_label] = 0
        for age in ages_in_bin:
            pop_by_year_and_age[actual_label] = pop_by_year_and_age[actual_label] + pop_by_year_and_age[age]

        actual_df[actual_label] = pop_by_year_and_age[actual_label] / x_base_population

    # The report's years can be fractional; the UN data is requested for whole years.
    whole_years = list(set(actual_df.index.values.astype(int).tolist()))

    unwp_df = unwp.extract_population_by_age(country=country,
                                             version=version,
                                             years=whole_years,
                                             filename=unworld_pop_filename)
    unwp_df.index = unwp_df[COL_NAME_YEAR]

    expected_df = pd.DataFrame()
    expected_df.index = unwp_df.index

    for lower_age, upper_age in bin_edges:
        unwp_column = str(lower_age)
        expected_label = f"Expected Population: {lower_age}-{upper_age}"
        expected_df[expected_label] = unwp_df[unwp_column]

    # Use a float index for the expected data as well.
    expected_df.index = expected_df.index.astype(float)

    xy_plot.xy_plot(img_dir=img_dir,
                    df=actual_df,
                    expected_df=expected_df,
                    title_1="Population by Age",
                    title_2=f"UN World Population - {country} - {version}",
                    y_axis_name="Number of People",
                    fraction_of_total=False,
                    min_x=None, max_x=None, min_y=None, max_y=None,
                    x_axis_as_log_scale=False,
                    y_axis_as_log_scale=False)




[docs]
def plot_risk(dir_or_filename: str,
              starting_expected_values: dict = None,
              expected_value_for_high_per_node: list[float] = None,
              gender: str = None,
              age_bin_list: list[float] = None,
              show_avg_per_run: bool = False,
              show_fraction: bool = False,
              img_dir: str = None):
    """
    Plot the Risk values (LOW, MEDIUM, HIGH) of the population.

    When expected values are supplied, one plot is created per entry of
    'expected_value_for_high_per_node' (i.e. per node) and the expected values are
    shown next to the data.  When no expected values are supplied, a single plot
    without a node filter is created.

    Args:
        dir_or_filename (str, required):
            The directory or filename containing the ReportHIVByAgeAndGender.csv files.

        starting_expected_values (dict, optional):
            The starting fractions of how risk is distributed to the population, keyed
            by "LOW", "MEDIUM", and "HIGH".  These should be the same values that you
            have in the demographics.  "HIGH" must be 0.0 - typically one might have
            0.85 for LOW, 0.15 for MEDIUM, and 0.0 for HIGH because people get set to
            HIGH in the campaign's CSW logic.  Must be given together with
            'expected_value_for_high_per_node'.

        expected_value_for_high_per_node (list[float], optional):
            A list of expected fractions of the population to have Risk = HIGH, one per
            node.  The node ID of each value is its position in the list plus 1.
            The starting LOW and MEDIUM values are scaled by (1 - HIGH) for each node.
            Must be given together with 'starting_expected_values'.

        gender (str, optional):
            The gender (Male or Female) to filter the data for.

        age_bin_list (list[float], optional):
            A list of ages in years that mark the edges of the age bins.  For example,
            [10, 25, 30, 55] gives three bins: [10-25), [25-30), [30-55).

        show_avg_per_run (bool, optional):
            If 'dir_or_filename' is a directory, plot the average between the files
            at each time step.  Default is False.

        show_fraction (bool, optional):
            True indicates that for each stratification the number of people with a given
            risk value is divided by the total number of people in that stratification.

        img_dir (str, optional):
            Directory to save the images. If None, the images will not be saved and a window will be opened.

    Returns:
        None - but image will be saved or window opened.

    Raises:
        ValueError: If only one of the two expected-value parameters is given, or if
            starting_expected_values["HIGH"] is not zero.
    """
    have_starting_values = starting_expected_values is not None
    have_high_per_node = expected_value_for_high_per_node is not None

    # The two parameters only make sense as a pair.
    if have_starting_values != have_high_per_node:
        raise ValueError("'starting_expected_values' and 'expected_value_for_high_per_node'"
                         " need to be either both None or both defined.")

    if have_starting_values and (starting_expected_values["HIGH"] != 0.0):
        raise ValueError("Expected 'starting_expected_values['HIGH'] to be zero.")

    if not (have_starting_values and have_high_per_node):
        # Nothing to compare against: one plot over all nodes.
        plot_population_by_ip(dir_or_filename=dir_or_filename,
                              node_id=None,
                              gender=gender,
                              age_bin_list=age_bin_list,
                              ip_key="Risk",
                              ip_values=["LOW", "MEDIUM", "HIGH"],
                              show_avg_per_run=show_avg_per_run,
                              show_fraction=show_fraction,
                              expected_values=None,
                              img_dir=img_dir)
        return

    # Node IDs start at 1 and follow the order of the list.
    for node_id, high_value in enumerate(expected_value_for_high_per_node, start=1):
        not_high = 1.0 - high_value
        expected_values = {
            "LOW": starting_expected_values["LOW"] * not_high,
            "MEDIUM": starting_expected_values["MEDIUM"] * not_high,
            "HIGH": high_value,
        }

        plot_population_by_ip(dir_or_filename=dir_or_filename,
                              node_id=node_id,
                              gender=gender,
                              age_bin_list=age_bin_list,
                              ip_key="Risk",
                              ip_values=["LOW", "MEDIUM", "HIGH"],
                              show_avg_per_run=show_avg_per_run,
                              show_fraction=show_fraction,
                              expected_values=expected_values,
                              img_dir=img_dir)




[docs]
def plot_vmmc_for_dir(dir_or_filename: str,
                      node_id: int = None,
                      age_bin_list: list[float] = None,
                      show_expected: bool = False,
                      show_avg_per_run: bool = False,
                      img_dir: str = None):
    """
    Create a plot showing what fraction of men are circumcised over time versus the
    expected fraction of circumcised men.

    For every report file the fraction is calculated per column as
    circumcised / (circumcised + uncircumcised).  Without averaging there is one
    curve per file and column (named "<filename>-<column>"); with averaging the
    fractions of all files are averaged per column.

    When 'show_expected' is True two expected curves are added from values that are
    hard-coded in this function (presumably specific to one country - confirm before
    reusing them elsewhere):

    - "Expected Traditional": a constant per-node value for nodes 1 through 10.
    - "Expected Plus Medical": 0 before 2016, a straight line from 0.54 in 2016 to
      0.9 in 2021, and 0.9 after 2021.

    Args:
        dir_or_filename (str, required):
            The directory or filename containing the ReportHIVByAgeAndGender.csv files.

        node_id (int, optional):
            The ID of the node for which the data is being filtered for.  Required, and
            must be between 1 and 10, when 'show_expected' is True.

        age_bin_list (list[float], optional):
            A list of ages in years that mark the edges of the age bins.  For example,
            [10, 25, 30, 55] gives three bins: [10-25), [25-30), [30-55).

        show_expected (bool, optional):
            If true, plot the expected fraction of circumcisions.

        show_avg_per_run (bool, optional):
            If 'dir_or_filename' is a directory, plot the average between the files
            at each time step.  Default is False.

        img_dir (str, optional):
            Directory to save the images. If None, the images will not be saved and a window will be opened.

    Returns:
        None - but image will be saved or window opened.

    Raises:
        ValueError: If 'show_expected' is True and 'node_id' is None or is not the ID
            of a node that has an expected value.
    """
    # Expected fraction of circumcised men per node; position + 1 is the node ID.
    traditional_coverage_per_node = [0.054978651, 0.139462861, 0.028676043, 0.091349358, 0.123187070,
                                     0.039308099, 0.727917322, 0.041105263, 0.044388102, 0.398239794]
    # End points (year, fraction) of the linear ramp used for "Expected Plus Medical".
    rtec_x1 = 2016
    rtec_x2 = 2021
    rtec_y1 = 0.54
    rtec_y2 = 0.9

    if show_expected:
        if node_id is None:
            raise ValueError("You need to specify 'node_id' if you want to compare actual data against the expected values.")
        # Check the range before any file is read.  Without this check a node_id of 0
        # or less would silently pick a value from the END of the list (negative
        # indexing) and a node_id that is too large would fail late with an IndexError.
        if not (1 <= node_id <= len(traditional_coverage_per_node)):
            raise ValueError(f"'node_id' must be between 1 and {len(traditional_coverage_per_node)} "
                             f"to show the expected values, but {node_id} was given.")

    dir_filenames = helpers.get_filenames(dir_or_filename=dir_or_filename,
                                          file_prefix="ReportHIVByAgeAndGender",
                                          file_extension=".csv")

    combined_df = pd.DataFrame()
    for fn in dir_filenames:
        # Only the first element of each returned pair is needed here.
        df_circ, _ = extract_population_data_multiple_ages(fn,
                                                           node_id=node_id,
                                                           gender="Male",
                                                           age_bin_list=age_bin_list,
                                                           other_strat_column_name=COL_NAME_IS_CIRC,
                                                           other_strat_value=1)
        df_un_circ, _ = extract_population_data_multiple_ages(fn,
                                                              node_id=node_id,
                                                              gender="Male",
                                                              age_bin_list=age_bin_list,
                                                              other_strat_column_name=COL_NAME_IS_CIRC,
                                                              other_strat_value=0)

        # The first file defines the index and creates zero-filled columns so that
        # the averaging branch below has something to add to.
        if len(combined_df.columns) == 0:
            combined_df.index = df_circ.index
            for column_name in df_circ.columns:
                name = column_name if show_avg_per_run else fn + "-" + column_name
                combined_df[name] = 0

        for column_name in df_circ.columns:
            fraction_circ = df_circ[column_name] / (df_circ[column_name] + df_un_circ[column_name])
            if show_avg_per_run:
                # Running sum over the files; turned into the mean after the loop.
                combined_df[column_name] = combined_df[column_name] + fraction_circ
            else:
                combined_df[fn + "-" + column_name] = fraction_circ

    if show_avg_per_run:
        for column_name in combined_df.columns:
            combined_df[column_name] = combined_df[column_name] / len(dir_filenames)

    expected_df = None
    if show_expected:
        expected_df = pd.DataFrame()
        expected_df.index = combined_df.index
        expected_df["Expected Traditional"] = traditional_coverage_per_node[node_id - 1]

        rtec = []
        for year in combined_df.index.values.astype(float).tolist():
            if year < rtec_x1:
                rtec.append(0)
            elif year > rtec_x2:
                rtec.append(rtec_y2)
            else:
                # Linear interpolation between (rtec_x1, rtec_y1) and (rtec_x2, rtec_y2).
                rtec.append(rtec_y1 + (year - rtec_x1) * (rtec_y2 - rtec_y1) / (rtec_x2 - rtec_x1))
        expected_df["Expected Plus Medical"] = rtec

    title = "Circumcised Men"
    if show_avg_per_run:
        title = title + " - Average"
    if node_id is not None:
        title = title + " - Node " + str(node_id)

    title2 = None
    if TEST_include_dir_or_filename:
        title2 = dir_or_filename

    # Legend and markers are only shown for the averaged plot.
    xy_plot.xy_plot(img_dir=img_dir,
                    df=combined_df,
                    expected_df=expected_df,
                    title_1=title,
                    title_2=title2,
                    y_axis_name="Fraction of Circumcised Men",
                    fraction_of_total=False,
                    show_legend=show_avg_per_run,
                    show_markers=show_avg_per_run,
                    min_x=None, max_x=None, min_y=None, max_y=None,
                    x_axis_as_log_scale=False,
                    y_axis_as_log_scale=False)




[docs]
def plot_prevalence_for_dir(dir_or_filename: str,
                            exp_dir_or_filename: str = None,
                            node_id: int = None,
                            gender: str = None,
                            age_bin_list: list[float] = None,
                            show_avg_per_run: bool = False,
                            show_fraction: bool = False,
                            img_dir: str = None):
    """
    Plot who is infected with HIV over time, optionally split into age bins.

    This is a thin wrapper that asks base_plot_by_age() to plot the infected column.

    Args:
        dir_or_filename (str, required):
            The directory or filename containing the ReportHIVByAgeAndGender.csv files.

        exp_dir_or_filename (str, optional):
            The expected or alternate directory or filename containing the
            ReportHIVByAgeAndGender.csv files.  Default is None.

        node_id (int, optional):
            The ID of the node to filter the data for.

        gender (str, optional):
            The gender (Male or Female) to filter the data for.

        age_bin_list (list[float], optional):
            A list of ages in years that mark the edges of the age bins.  For example,
            [10, 25, 30, 55] gives three bins: [10-25), [25-30), [30-55).

        show_avg_per_run (bool, optional):
            If 'dir_or_filename' is a directory, plot the average between the files
            at each time step.  Default is False.

        show_fraction (bool, optional):
            True indicates that the number of infected people should be divided by the total
            number of people in that stratification.

        img_dir (str, optional):
            Directory to save the images. If None, the images will not be saved and a window will be opened.

    Returns:
        None - but image will be saved or window opened.
    """
    plot_options = dict(
        base_title="Infected",
        main_column_name=COL_NAME_INFECTED,
        dir_or_filename=dir_or_filename,
        exp_dir_or_filename=exp_dir_or_filename,
        node_id=node_id,
        age_bins=age_bin_list,
        gender=gender,
        show_avg_per_run=show_avg_per_run,
        show_fraction=show_fraction,
        # No alternate divisor column is used for this plot.
        fraction_of=False,
        fraction_of_column_name="",
        fraction_of_str="",
        img_dir=img_dir,
    )
    base_plot_by_age(**plot_options)




[docs]
def plot_risk_zambia(dir_or_filename: str,
                     age_bin_list: list[float] = None,
                     show_avg_per_run: bool = False,
                     show_fraction: bool = False,
                     show_expected: bool = False,
                     img_dir: str = None):
    """
    Create risk value plots for Zambia by calling plot_risk() once per gender
    (Female first, then Male).

    The plots can show the count or the fraction of the group that has each of the
    three risk values.  With 'show_expected' the Zambia-specific expected values that
    are hard-coded in this function are passed along, which makes plot_risk() create
    one plot per node.

    Args:
        dir_or_filename (str, required):
            The directory or filename containing the ReportHIVByAgeAndGender.csv files.

        age_bin_list (list[float], optional):
            A list of ages in years that mark the edges of the age bins.  For example,
            [10, 25, 30, 55] gives three bins: [10-25), [25-30), [30-55).

        show_avg_per_run (bool, optional):
            If 'dir_or_filename' is a directory, plot the average between the files
            at each time step.  Default is False.

        show_fraction (bool, optional):
            True indicates that the data is not true counts but a fraction
            (i.e. a count divided by another counter)

        show_expected (bool, optional):
            True indicates that you want to see the expected fractions of the population that
            have the different risk values PER NODE.  False will be data for all nodes.

        img_dir (str, optional):
            Directory to save the images. If None, the images will not be saved and a window will be opened.

    Returns:
        None - but image will be saved or window opened.
    """
    zambia_starting_values = {
        "LOW": 0.85,
        "MEDIUM": 0.15,
        "HIGH": 0.0
    }

    # Expected HIGH-risk fraction per node for each gender (only four nodes are active).
    # The ten-node versions are kept here for reference:
    #   Female: [0.125, 0.125, 0.125, 0.0500, 0.0500, 0.0500, 0.125, 0.0500, 0.125, 0.0500]
    #   Male:   [0.195, 0.195, 0.195, 0.0781, 0.0781, 0.0781, 0.195, 0.0781, 0.195, 0.0781]
    high_per_node_by_gender = {
        "Female": [0.125, 0.125, 0.125, 0.0500],
        "Male": [0.195, 0.195, 0.195, 0.0781],
    }

    for gender, high_per_node in high_per_node_by_gender.items():
        # plot_risk() wants both expected arguments or neither.
        if show_expected:
            starting_values_arg = zambia_starting_values
            high_per_node_arg = high_per_node
        else:
            starting_values_arg = None
            high_per_node_arg = None

        plot_risk(dir_or_filename=dir_or_filename,
                  starting_expected_values=starting_values_arg,
                  expected_value_for_high_per_node=high_per_node_arg,
                  gender=gender,
                  age_bin_list=age_bin_list,
                  show_avg_per_run=show_avg_per_run,
                  show_fraction=show_fraction,
                  img_dir=img_dir)



if __name__ == '__main__':
    # Command line entry point: create one plot type, or the "summary" set of plots,
    # for a directory of ReportHIVByAgeAndGender.csv files or for a single file.
    import argparse

    def _print_notice(message: str) -> None:
        """Print 'message' framed above and below by a row of '=' of the same width."""
        rule = "=" * len(message)
        print(rule)
        print(message)
        print(rule)

    parser = argparse.ArgumentParser()

    parser.add_argument('dir_or_filename', type=str, nargs=1, help='A directory with ReportHivByAgeAndGender.csv files or a single file.')
    parser.add_argument('exp_dir_or_filename', type=str, default=None, nargs='?', help='A directory with ReportHivByAgeAndGender.csv files or a single file to compare to.')
    parser.add_argument('-p', '--plot', default='population', help='Options: population, prevalence, risk, vmmc, art, state, column, summary')
    parser.add_argument('-o', '--output', default=None, help='If provided, a directory will be created and images saved to the folder.  If not provided, it opens windows.')
    parser.add_argument('-n', '--node_id', type=int, default=None, help='Plot the data for a specific node.')
    parser.add_argument('-m', '--mean', help='Gives the average/mean of each run at that time point.', action='store_true')
    parser.add_argument('-a', '--ages', help='Show the data stratified by age.', action='store_true')
    parser.add_argument('-x', '--expected', help='Show the expected data.', action='store_true')
    parser.add_argument('-c', '--column_name', default='Died', help='Only used with plot=column.  The name of a non-stratification column from the report.  Do not include the space before the name.')

    args = parser.parse_args()

    dir_or_filename = args.dir_or_filename[0]        # nargs=1 yields a one-element list
    exp_dir_or_filename = args.exp_dir_or_filename   # None when the second positional is omitted
    node_id = args.node_id

    # "summary" is shorthand for a fixed set of plots; anything else is a single plot.
    if args.plot == "summary":
        plot_list = ["population", "prevalence", "vmmc", "art", "state"]
    else:
        plot_list = [args.plot]

    for plot in plot_list:
        if plot == "population":
            if args.expected:
                # The message used to say "Prevalence plot" here (copy/paste slip).
                _print_notice("Population plot does not support expected data.")

            age_bin_list = None
            if args.ages:
                age_bin_list = [0, 15, 25, 35, 45, 55, 100]

            plot_population_by_age(dir_or_filename=dir_or_filename,
                                   exp_dir_or_filename=exp_dir_or_filename,
                                   node_id=node_id,
                                   gender=None,
                                   age_bin_list=age_bin_list,
                                   show_avg_per_run=args.mean,
                                   img_dir=args.output)

        elif plot == "prevalence":
            if args.expected:
                _print_notice("Prevalence plot does not support expected data.")

            age_bin_list = None
            if args.ages:
                age_bin_list = [15, 25, 35, 45, 55]

            plot_prevalence_for_dir(dir_or_filename=dir_or_filename,
                                    exp_dir_or_filename=exp_dir_or_filename,
                                    node_id=node_id,
                                    gender=None,
                                    age_bin_list=age_bin_list,
                                    show_avg_per_run=args.mean,
                                    show_fraction=False,
                                    img_dir=args.output)

        elif plot == "risk":
            # node_id IS forwarded to plot_population_by_ip() below; the option that
            # this code path ignores is --expected (only the disabled
            # plot_risk_zambia() call uses it), so that is what the user is told.
            if args.expected:
                _print_notice("Risk plot does not support expected data.")

            age_bin_list = None
            if args.ages:
                age_bin_list = [15, 35, 55]

            plot_population_by_ip(dir_or_filename=dir_or_filename,
                                  exp_dir_or_filename=exp_dir_or_filename,
                                  node_id=node_id,
                                  ip_key="Risk",
                                  age_bin_list=age_bin_list,
                                  show_avg_per_run=args.mean,
                                  show_fraction=False,
                                  img_dir=args.output)
            # Alternative that supports --expected (but not node_id):
            # plot_risk_zambia(dir_or_filename=dir_or_filename,
            #                  age_bin_list=age_bin_list,
            #                  show_avg_per_run=args.mean,
            #                  show_fraction=True,
            #                  show_expected=args.expected,
            #                  img_dir=args.output)

        elif plot == "vmmc":
            # plot_vmmc_by_age() has no expected data; only the disabled
            # plot_vmmc_for_dir() call uses --expected.
            if args.expected:
                _print_notice("VMMC plot does not support expected data.")

            age_bin_list = None
            if args.ages:
                age_bin_list = [15, 20, 25, 30, 35, 40, 45, 50, 55]

            plot_vmmc_by_age(dir_or_filename=dir_or_filename,
                             exp_dir_or_filename=exp_dir_or_filename,
                             node_id=node_id,
                             age_bin_list=age_bin_list,
                             show_avg_per_run=args.mean,
                             img_dir=args.output)
            # Alternative that supports --expected:
            # plot_vmmc_for_dir(dir_or_filename=dir_or_filename,
            #                   node_id=args.node_id,
            #                   age_bin_list=age_bin_list,
            #                   show_expected=args.expected,
            #                   show_avg_per_run=args.mean,
            #                   img_dir=args.output)

        elif plot == "art":
            if args.expected:
                _print_notice("ART plot does not support expected data.")

            age_bin_list = None
            if args.ages:
                age_bin_list = [0, 15, 50, 75, 100]

            plot_onART_by_age(dir_or_filename=dir_or_filename,
                              exp_dir_or_filename=exp_dir_or_filename,
                              node_id=node_id,
                              gender=None,
                              age_bin_list=age_bin_list,
                              show_avg_per_run=args.mean,
                              show_fraction=True,
                              fraction_of_infected=True,
                              img_dir=args.output)

        elif plot == "state":
            if args.expected:
                _print_notice("State plot does not support expected data.")

            age_bin_list = None
            if args.ages:
                age_bin_list = [15, 30, 45]

            plot_population_by_ip(dir_or_filename=dir_or_filename,
                                  exp_dir_or_filename=exp_dir_or_filename,
                                  node_id=node_id,
                                  ip_key="CascadeState",
                                  age_bin_list=age_bin_list,
                                  show_avg_per_run=args.mean,
                                  show_fraction=False,
                                  img_dir=args.output)

        elif plot == "column":
            # The report's column names start with a space, which the user leaves out.
            plot_columns(filename=dir_or_filename,
                         title=args.column_name,
                         y_axis_name="Count",
                         column_names=[" " + args.column_name],
                         img_dir=args.output)

        else:
            raise ValueError(f"Plot type '{args.plot}' is not supported.")