run_observational_model

extract_sampled_infections(sample_df)

Remove rows where all sampling columns ('rep' columns) are NaN.

Parameters:

    sample_df (DataFrame): DataFrame with sampling columns. Required.

Returns:

    pd.DataFrame: DataFrame with rows removed where all sampling columns are NaN.

Source code in fpg_observational_model/run_observational_model.py
def extract_sampled_infections(sample_df):
    """
    Remove rows where all sampling columns ('rep' columns) are NaN.

    Parameters:
        sample_df (pd.DataFrame): DataFrame with sampling columns

    Returns:
        pd.DataFrame: DataFrame with rows removed where all sampling columns are NaN
    """
    # String to match in column names
    match_string = 'rep'

    # Identify columns matching the string
    sample_cols = sample_df.filter(regex=match_string)

    if sample_cols.empty:
        print("No 'rep' columns found - returning original DataFrame")
        return sample_df

    # Find rows where ALL sampling columns are NaN
    rows_all_nan = sample_cols.isna().all(axis=1)
    rows_to_drop = rows_all_nan.sum()

    print(f"Found {len(sample_cols.columns)} sampling columns: {sample_cols.columns.tolist()}")
    print(f"Dropping {rows_to_drop} rows where all sampling columns are NaN")

    # Keep rows where at least one sampling column has a value
    df_cleaned = sample_df[~rows_all_nan]

    return df_cleaned
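
For illustration, a minimal usage sketch; the 'rep_0'/'rep_1' column names are hypothetical, and the function treats any column whose name matches 'rep' as a sampling column:

import numpy as np
import pandas as pd

# Assumes the function is importable from the documented module
from fpg_observational_model.run_observational_model import extract_sampled_infections

# Hypothetical input: two sampling replicates; infection 1 was never sampled
df = pd.DataFrame({
    'infIndex': [0, 1, 2],
    'rep_0': [1.0, np.nan, np.nan],
    'rep_1': [np.nan, np.nan, 2.0],
})

cleaned = extract_sampled_infections(df)
print(cleaned['infIndex'].tolist())  # [0, 2] -- row 1 had only NaNs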

get_default_config()

Return the default observational model configuration.

Source code in fpg_observational_model/run_observational_model.py
def get_default_config():
    """Return the default observational model configuration."""
    observational_model_config = {
        'hard_filters': {
            'symptomatics_only': True, 
            'monogenomic_infections_only': False,
            'day_snapshot': False
        },
        'intervention_start_month': 29, # Month at which an intervention is applied. Pre/post-intervention sampling is currently supported for a single intervention only.
        'sampling_configs': {
            'random': {
                'method': 'random',
                'n_samples_year': 100,
                'replicates': 2,
                'method_params': {
                    'population_proportions': [1, 0], # Use to sample from the source or sink only, equally, etc. Within-population comparisons of genetic metrics can be specified below; just make sure the total number of samples per year * proportion reflects the numbers you want per population.
                    'monogenomic_proportion': False, # Set to False if sampling randomly 
                    'equal_monthly': False}
            },
            'seasonal': {
                'method': 'seasonal',
                'n_samples_year': 100,
                'replicates': 2,
                'method_params': {
                    'season': 'full', # Options: full or peak; currently hardcoded to match Senegal's seasonality; update for other scenarios in unified_sampling.py
                }
            },
            # 'age': { # Example of how to set up a sampling scheme based on age, to mirror biased sampling such as school surveys and health facility comparisons.
            #     'method': 'age',
            #     'n_samples_year': 15,
            #     'replicates': 1
            # }

        },
        'metrics': {
            'cotransmission_proportion': True,
            'complexity_of_infection': True, # Will calculate both true COI and effective COI from the unique strains identified in the sample.
            'heterozygosity': True,
            'identity_by_descent': False,
            'identity_by_state': True,
            'individual_ibx': True,
            'fws': True,
            'monogenomic_proportion': True,
            'rh': True,
            'unique_genome_proportion': True # Will calculate the proportion of unique genomes both from all sampled infections (to replicate phasing) and from monogenomic samples with an effective COI of 1 only (to match barcode limits).
        },
        'subpopulation_comparisons': { # Supported for yearly and seasonal temporal sampling schemes, not age-based sampling. 
            'add_monthly': False,  # Whether to add monthly comparisons within each year
            'populations': False,  # Defined by the population node in EMOD
            'polygenomic': True,  # Is polygenomic = 1, else monogenomic = 0
            'symptomatic': False,  # Is symptomatic = 1, else asymptomatic = 0
            'age_bins': False     # Default age bins: 0-5, 5-15, 15+
        }
    }

    return observational_model_config
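
Because run_observational_model (below) deep-merges any user-supplied config with these defaults, only the keys that differ need to be specified. A minimal sketch; the override values are illustrative:

from fpg_observational_model.run_observational_model import get_default_config

# Option 1: mutate a copy of the defaults directly
config = get_default_config()
config['hard_filters']['symptomatics_only'] = False
config['sampling_configs']['random']['n_samples_year'] = 200

# Option 2: pass a partial dict to run_observational_model, which fills
# in every missing key from the defaults via its deep merge
partial_config = {'sampling_configs': {'random': {'n_samples_year': 200}}}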

load_matrix_safely(file_path, max_retries=3, use_local_copy=True)

Safely load a numpy matrix with memory mapping, using raw reconstruction as a fallback.

Source code in fpg_observational_model/run_observational_model.py
def load_matrix_safely(file_path, max_retries=3, use_local_copy=True):
    """
    Safely load numpy matrix with memory mapping using raw reconstruction.
    """
    import os
    import ast
    import re
    import tempfile
    import numpy as np

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    print(f"Loading matrix with mmap from: {file_path}")

    # Method 1: Try standard mmap first
    try:
        array = np.load(file_path, mmap_mode='r')
        print(f"Successfully loaded with standard mmap, shape: {array.shape}")
        return array
    except Exception as e1:
        print(f"Standard mmap failed: {e1}")

    # Method 2: Raw reconstruction with manual mmap
    try:
        print(f"Method 2: Raw reconstruction with mmap")
        with open(file_path, 'rb') as f:
            # Read header info
            magic = f.read(6)
            if magic != b'\x93NUMPY':
                raise ValueError("Not a numpy file")

            major, minor = f.read(2)

            if major == 1:
                header_len = np.frombuffer(f.read(2), dtype=np.uint16)[0]
            else:
                header_len = np.frombuffer(f.read(4), dtype=np.uint32)[0]

            header = f.read(header_len).decode('latin1')

            # Clean up and parse the header dictionary
            header_clean = re.sub(r'\s+', ' ', header.strip())
            header_dict = ast.literal_eval(header_clean)

            shape = header_dict['shape']
            dtype = header_dict['descr']
            fortran_order = header_dict['fortran_order']

            # Calculate data offset
            data_offset = f.tell()

        print(f"Parsed - Shape: {shape}, Dtype: {dtype}, Data offset: {data_offset}")

        # Create memory-mapped array directly from file
        array = np.memmap(file_path, dtype=dtype, mode='r', 
                         offset=data_offset, shape=shape,
                         order='F' if fortran_order else 'C')

        print(f"Successfully created memmap with shape: {array.shape}")
        return array

    except Exception as e2:
        print(f"Mmap reconstruction failed: {e2}")

        # Fallback: load into memory, then create a temp mmap file
        try:
            print("Fallback: Creating temporary mmap file")
            # Re-parse the header here so this branch does not depend on
            # variables that may be undefined if Method 2 failed early
            with open(file_path, 'rb') as f:
                magic = f.read(6)
                if magic != b'\x93NUMPY':
                    raise ValueError("Not a numpy file")
                major, minor = f.read(2)
                if major == 1:
                    header_len = np.frombuffer(f.read(2), dtype=np.uint16)[0]
                else:
                    header_len = np.frombuffer(f.read(4), dtype=np.uint32)[0]
                header = f.read(header_len).decode('latin1')
                header_dict = ast.literal_eval(re.sub(r'\s+', ' ', header.strip()))
                shape = header_dict['shape']
                dtype = header_dict['descr']
                fortran_order = header_dict['fortran_order']

                # Read the raw payload and reshape in the stored order
                data = np.frombuffer(f.read(), dtype=dtype)
                array = data.reshape(shape, order='F' if fortran_order else 'C')

            # Create a temporary file for memory mapping
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.npy')
            np.save(temp_file.name, array)
            temp_file.close()

            # Load as mmap from temp file
            mmap_array = np.load(temp_file.name, mmap_mode='r')

            # Store temp filename for cleanup (you'd need to handle this)
            mmap_array._temp_file = temp_file.name

            print(f"Created temporary mmap file: {temp_file.name}")
            return mmap_array

        except Exception as e3:
            print(f"Temp mmap fallback failed: {e3}")

    print(f"All mmap methods failed for {file_path}")
    return None
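
Typical usage; the path is illustrative, and note the helper returns None rather than raising when every strategy fails:

from fpg_observational_model.run_observational_model import load_matrix_safely

variants = load_matrix_safely('/path/to/emod_output/variants.npy')
if variants is None:
    raise FileNotFoundError("variants.npy could not be loaded")

# The result is a read-only memory-mapped array: slicing touches only the
# pages it needs, so large genotype matrices stay off the heap.
# (Assumes a 2-D genotype matrix.)
first_row = variants[0, :]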

make_json_serializable(obj)

Convert nested dictionary with tuple keys to JSON-serializable format

Source code in fpg_observational_model/run_observational_model.py
def make_json_serializable(obj):
    """Convert nested dictionary with tuple keys to JSON-serializable format"""
    if isinstance(obj, dict):
        # Stringify keys (JSON keys must be strings; tuples are not allowed)
        # and recurse into the values
        return {str(k): make_json_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, tuple):
        # Tuple values are stringified rather than converted to lists
        return str(obj)
    else:
        return obj
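
A quick round trip shows why this is needed: json.dumps rejects tuple keys, so they are stringified first. (Tuples nested inside lists pass through the helper unchanged; json.dumps then serializes them as JSON arrays.)

import json

from fpg_observational_model.run_observational_model import make_json_serializable

dist_dict = {('ibd', 2020): [0.1, 0.25], ('ibs', 2020): [0.9]}

# json.dumps(dist_dict) would raise TypeError: keys must be str, int, ...
serializable = make_json_serializable(dist_dict)
print(json.dumps(serializable))
# {"('ibd', 2020)": [0.1, 0.25], "('ibs', 2020)": [0.9]}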

process_file(file_row, output_summary_dir, config_path=None, verbose=False)

Process a single file for parallel execution.

Parameters:

    file_row: pandas Series or dict with 'output_name' and 'input_dir' columns. Required.
    output_summary_dir: Directory to save outputs. Required.
    config_path: Path to config file. Optional; defaults to None.
    verbose: Whether to print verbose output. Defaults to False.

Returns:

    str: "SUCCESS: <sim_name>" on success, or an error message on failure.

Source code in fpg_observational_model/run_observational_model.py
def process_file(file_row, output_summary_dir, config_path=None, verbose=False):
    """
    Process a single file for parallel execution.

    Parameters:
        file_row: pandas Series or dict with 'output_name' and 'input_dir' columns
        output_summary_dir: Directory to save outputs
        config_path: Path to config file (optional)
        verbose: Whether to print verbose output

    Returns:
        str: "SUCCESS: <sim_name>" on success, or an error message on failure
    """
    try:
        # Extract information from the row
        sim_name = file_row['output_name']
        emod_output_path = file_row['input_dir']

        # Use default config if not specified
        if config_path is None or not os.path.exists(config_path):
            config_path = ""  # Will trigger default config usage

        # Create output directory for this simulation
        output_path = os.path.join(output_summary_dir, sim_name)

        # Run the observational model (it returns None; status is reported below)
        run_observational_model(
            sim_name=sim_name,
            emod_output_path=emod_output_path,
            config_path=config_path,
            output_path=output_path,
            verbose=verbose
        )

        return f"SUCCESS: {sim_name}"

    except Exception as e:
        error_msg = f"ERROR processing {file_row.get('output_name', 'unknown')}: {str(e)}"
        if verbose:
            import traceback
            print(f"{error_msg}\n{traceback.format_exc()}")
        return error_msg
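
A sketch of fanning process_file out across simulations with the standard library; the manifest filename and worker count are assumptions, but the required columns match the function's contract:

from concurrent.futures import ProcessPoolExecutor

import pandas as pd

# Hypothetical manifest: one row per simulation, with the two required columns
manifest = pd.read_csv('simulation_manifest.csv')  # 'output_name', 'input_dir'

with ProcessPoolExecutor(max_workers=4) as pool:
    futures = [
        pool.submit(process_file, row, 'summaries/', config_path=None, verbose=False)
        for _, row in manifest.iterrows()
    ]
    for future in futures:
        print(future.result())  # "SUCCESS: <name>" or "ERROR processing ..."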

run_observational_model(sim_name, emod_output_path, config_path=None, config=None, output_path=None, verbose=True)

Run the observational model with either a config file or config dictionary.

Parameters:

    sim_name: Name of the simulation. Required.
    emod_output_path: Path to EMOD output files. Required.
    config_path: Path to config JSON file. Optional; defaults to None.
    config: Configuration dictionary. Optional; defaults to None.
    output_path: Directory to save outputs. Defaults to None, which writes to "output".
    verbose: Whether to print verbose output. Defaults to True.

Note: If both config_path and config are provided, config_path takes precedence. If neither is provided, the default config is used. Missing parameters in a provided config are filled with default values; parameters not present in the default config trigger a warning.

Source code in fpg_observational_model/run_observational_model.py
def run_observational_model(
        sim_name,
        emod_output_path,
        config_path=None,
        config=None,
        output_path=None,
        verbose=True):
    """
    Run the observational model with either a config file or config dictionary.

    Parameters:
        sim_name: Name of the simulation
        emod_output_path: Path to EMOD output files
        config_path: Path to config JSON file (optional)
        config: Configuration dictionary (optional)
        output_path: Directory to save outputs
        verbose: Whether to print verbose output

    Note: If both config_path and config are provided, config_path takes precedence.
          If neither is provided, default config is used.
          Missing parameters in provided config will be filled with default values.
          Extra parameters not in default config will trigger a warning.
    """

    # Helper function to check for unknown keys
    def check_unknown_keys(default_dict, user_dict, path="config"):
        """Recursively check for keys in user_dict that don't exist in default_dict."""
        unknown_keys = []
        for key, value in user_dict.items():
            if key not in default_dict:
                unknown_keys.append(f"{path}.{key}")
            elif isinstance(value, dict) and isinstance(default_dict.get(key), dict):
                # Recursively check nested dictionaries
                nested_unknown = check_unknown_keys(default_dict[key], value, f"{path}.{key}")
                unknown_keys.extend(nested_unknown)
        return unknown_keys

    # Helper function to deep merge dictionaries
    def deep_merge(default_dict, override_dict):
        """Recursively merge override_dict into default_dict, preserving defaults for missing keys."""
        result = default_dict.copy()
        for key, value in override_dict.items():
            if key in result and isinstance(result[key], dict) and isinstance(value, dict):
                result[key] = deep_merge(result[key], value)
            else:
                result[key] = value
        return result

    # Start with default config
    default_config = get_default_config()

    # Add a warning if both config_path and config are provided
    if config_path and config is not None:
        print("Warning: Both config_path and config dictionary provided. "
              "config_path will take precedence.")

    # Determine which config to use and merge with defaults
    user_config = None
    config_source = None

    if config_path and os.path.isfile(config_path):
        try:
            with open(config_path, 'r') as file:
                user_config = json.load(file)
            config_source = config_path
            if verbose:
                print(f"Loaded config from {config_path}")
        except json.JSONDecodeError as e:
            raise ValueError(f"Error parsing JSON from {config_path}: {e}")
        except Exception as e:
            raise ValueError(f"Error reading config file {config_path}: {e}")
    elif config is not None:
        user_config = config
        config_source = "provided dictionary"
        if verbose:
            print("Using provided config dictionary")

    # Check for unknown keys and warn user
    if user_config is not None:
        unknown_keys = check_unknown_keys(default_config, user_config)
        if unknown_keys:
            warning_msg = f"\nWARNING: Found unknown configuration parameters in {config_source}:"
            for key in unknown_keys:
                warning_msg += f"\n  - {key}"
            warning_msg += "\n\nThese parameters will be ignored. Please check for typos or refer to default config."
            warning_msg += "\nValid top-level parameters are: " + ", ".join(default_config.keys())
            print(warning_msg)

            # Optional: Ask user to confirm if they want to continue
            if verbose:
                print("\nContinuing with valid parameters merged with defaults...\n")

        # Merge user config with defaults
        config = deep_merge(default_config, user_config)
        if verbose:
            print("Merged user config with default values for missing parameters")
    else:
        config = default_config
        if verbose:
            print("No config provided, using default config to specify running modes.")

    # Read in infection data
    infection_df_path = f'{emod_output_path}/infIndexRecursive-genomes-df.csv'
    if os.path.exists(infection_df_path):
        infection_df = pd.read_csv(infection_df_path)
        if verbose:
            print(f"Loaded data from {infection_df_path}: {len(infection_df)} records")
    else:
        if verbose:
            print(f"Error: {infection_df_path} not found. Loading test data.")
        infection_df = pd.read_csv('test_data/test_fpg_infections.csv')

    # Run sampling model
    sample_df = run_sampling_model(
        input_df=infection_df,
        config=config,
        intervention_start_month=config['intervention_start_month']
    )
    sample_df = extract_sampled_infections(sample_df)
    sample_df['original_nid'] = sample_df['recursive_nid'].copy()

    # NOTE: Commented out to preserve original_nid without changing recursive_nid. In theory this updated index can be used to read in smaller genotype matrix below of infections sampled across all sampling schemes. Will require changes to functions that use the original_nid to map genomes to infections. 
    # sample_df = update_matrix_indices(sample_df)
    # matrix_indices = sample_df['recursive_nid'].tolist()

    # Identify additional file for metric calculations - mainly IBx
    # Memory-map the genotype files (doesn't load to RAM) for access later
    user_specified_ibx = []
    ibd_matrix = None
    ibs_matrix = None

    if config['metrics']['identity_by_descent']:
        user_specified_ibx.append('ibd')
        root_matrix_path = f'{emod_output_path}/roots.npy'
        if os.path.exists(root_matrix_path):
            ibd_matrix = load_matrix_safely(root_matrix_path)
            register_matrix('ibd_matrix', ibd_matrix)
        else:
            print(f"Warning: {root_matrix_path} not found, IBD calculations will be skipped")

    if (config['metrics']['identity_by_state']
            or config['metrics'].get('heterozygosity', True)
            or config['metrics']['rh']):
        user_specified_ibx.append('ibs')
        genotype_matrix_path = f'{emod_output_path}/variants.npy'

        if os.path.exists(genotype_matrix_path):
            ibs_matrix = load_matrix_safely(genotype_matrix_path)
        else:
            print(f"Error: {genotype_matrix_path} not found. Loading test data.")
            ibs_matrix = np.load("../test_data/variants.npy", mmap_mode='r')
        register_matrix('ibs_matrix', ibs_matrix)

    if config['metrics'].get('heterozygosity', True) and ibs_matrix is not None:
        # Generate barcode with Ns for heterozygosity calculations
        sample_df['original_nid'] = sample_df['original_nid'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
        sample_df[['genotype_coi', 'barcode_with_Ns', 'heterozygosity']] = sample_df.apply(lambda row: generate_het_barcode(ibs_matrix, row['original_nid']), axis=1, result_type='expand')

    # Run metric calculations
    all_summaries, all_infection_ibx, all_ibx_dist_dict = run_time_summaries(
        sample_df,
        subpop_config=config['subpopulation_comparisons'],
        user_ibx_categories=user_specified_ibx
    )

    # Save outputs
    if output_path is None:
        output_path = "output"

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    summary_output_filepath = f'{output_path}/{sim_name}_FPG_ModelSummaries.csv'
    all_summaries.to_csv(summary_output_filepath, index=False)
    sample_output_filepath = f'{output_path}/{sim_name}_FPG_SampledInfections.csv'
    # Merge in individual IBx results for sampled infections
    if not all_infection_ibx.empty:
        sample_df = sample_df.merge(all_infection_ibx, on='infIndex', how='left')
    sample_df.to_csv(sample_output_filepath, index=False)

    save_ibx_distributions = True
    if save_ibx_distributions:
        serializable_dict = make_json_serializable(all_ibx_dist_dict)
        for ibx_category, dist_dict in serializable_dict.items():
            dist_output_filepath = f'{output_path}/{sim_name}_{ibx_category}_distributions.json'
            save_json(dist_output_filepath, dist_dict)
            if verbose:
                print(f"Saved {ibx_category} distributions to {dist_output_filepath}")

    if verbose:
        print(f"Saved summary output to {summary_output_filepath}")
        print(f"Saved sample output to {sample_output_filepath}")

    return
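
Putting it together, a minimal end-to-end invocation; the paths and the partial config override are illustrative:

from fpg_observational_model.run_observational_model import run_observational_model

run_observational_model(
    sim_name='demo_sim',
    emod_output_path='/path/to/emod_output',  # must contain infIndexRecursive-genomes-df.csv
    config={'metrics': {'identity_by_descent': True}},  # deep-merged with the defaults;
                                                        # enabling IBD also requires roots.npy
    output_path='output/demo_sim',
    verbose=True,
)
# Writes demo_sim_FPG_ModelSummaries.csv, demo_sim_FPG_SampledInfections.csv, and one
# <ibx>_distributions.json file per requested IBx category under output/demo_sim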

update_matrix_indices(sample_df)

Update sample_df to add an 'original_nid' column preserving the original 'recursive_nid' values, and remap 'recursive_nid' to a global order based on its sorted unique values.

Source code in fpg_observational_model/run_observational_model.py
def update_matrix_indices(sample_df):
    """ 
    Update the sample_df DataFrame to include a new column 'original_nid' that stores the original maps the 'recursive_nid' values to a global order based on their unique values.
    """
    df = sample_df.copy()  # Work on a copy so the caller's DataFrame is untouched

    # Preserve the original values, parsing stringified lists if needed
    df['original_nid'] = df['recursive_nid'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

    # Step 1: Get all unique recursive_nid values across all rows
    all_nids = []
    for nid_list in df['original_nid']:
        all_nids.extend(nid_list)

    # Get unique values and sort them
    unique_nids = sorted(set(all_nids))

    # Step 2: Create a mapping from nid to its global order
    nid_to_order = {nid: i for i, nid in enumerate(unique_nids)}

    # Step 3: Apply the mapping to create the order column
    def map_to_global_order(nid_list):
        return [nid_to_order[nid] for nid in nid_list]

    df['recursive_nid'] = df['original_nid'].apply(map_to_global_order)

    return df
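
To make the remapping concrete, a small sketch with hypothetical nids (note that the pipeline currently leaves this step commented out, as flagged in run_observational_model above):

import pandas as pd

from fpg_observational_model.run_observational_model import update_matrix_indices

df = pd.DataFrame({'recursive_nid': ["[10, 42]", "[42, 7]"]})
remapped = update_matrix_indices(df)

print(remapped['original_nid'].tolist())   # [[10, 42], [42, 7]]
print(remapped['recursive_nid'].tolist())  # [[1, 2], [2, 0]]  (7->0, 10->1, 42->2)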