Collect all the time series data for a given channel for a given experiment from the InsetChart.json
files in a local subdirectory that have been downloaded from COMPS, assuming the following structure:

```
exp_id/
    sim_id/
        InsetChart.json
```
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `exp_id` | `str` | Experiment Id that has had data downloaded to the current working directory. | required |
| `chan` | `str` | Channel name. | `'Infected'` |
| `tag` | `str` | key=value. Using results.db (sqlite3, from emodpy), limit results to just where key=value. If value is set to SWEEP, find all values for key and plot each value separately (but with mean/spread from the other tags). | `None` |
Returns:
| Type | Description |
| --- | --- |
| `dict` | Dictionary of channel data arrays, keyed by tag value (or "ref"), for further processing. |
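For example, a minimal usage sketch, assuming the experiment output has already been downloaded into the current working directory; the experiment id and the swept tag key below are hypothetical placeholders:

```python
# Hypothetical ids/keys: replace "my_exp_id" and "Base_Infectivity" with your own.
from emod_api.channelreports.plot_icj_means import collect

# All simulations pooled together (key "ref" in the returned dict).
all_data = collect(exp_id="my_exp_id", chan="Infected")

# One group per value of the swept parameter, resolved via results.db.
swept_data = collect(exp_id="my_exp_id", chan="Infected", tag="Base_Infectivity=SWEEP")
```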
Source code in emod_api/channelreports/plot_icj_means.py
```python
def collect(exp_id: str,
            chan: str = "Infected",
            tag: str = None,
            smoothing: bool = True) -> dict:
    """
    Collect all the time series data for a given channel for a given experiment from the InsetChart.json
    files in a local subdirectory that have been downloaded from COMPS, assuming the following structure:

        exp_id/
            sim_id/
                InsetChart.json

    Args:
        exp_id: Experiment Id that has had data downloaded to the current working directory.
        chan: Channel name.
        tag: key=value. Using results.db (sqlite3, from emodpy), limit results to just where key=value.
            If value is set to SWEEP, find all values for key and plot each value separately
            (but with mean/spread from the other tags).

    Returns:
        Dictionary of channel data arrays for further processing.
    """
    chan_data = {}
    groupby_values = {}
    if tag:
        if len(tag.split("=")) == 1:
            raise ValueError("When passing tag, it has to have key=value format.")
        groupby_key = tag.split("=")[0]
        groupby_value = tag.split("=")[1]
        db = os.path.join("latest_experiment", "results.db")
        con = sqlite3.connect(db)
        cur = con.cursor()
        if groupby_value == "SWEEP":
            # Group sim_ids by every distinct value of the sweep key.
            query = f"SELECT sim_id, {groupby_key} FROM results"
            all_results = cur.execute(query)
            for result in all_results:
                sim_id = result[0]
                groupby_value = result[1]
                if groupby_value not in groupby_values:
                    groupby_values[groupby_value] = list()
                groupby_values[groupby_value].append(sim_id)
        else:  # select only sim_id's where gb key == value
            query = f"SELECT sim_id FROM results where {groupby_key} = {groupby_value}"
            all_results = cur.execute(query)
            groupby_values["ref"] = list()
            for result in all_results:
                sim_id = result[0]
                groupby_values["ref"].append(sim_id)
    else:
        groupby_values["ref"] = os.listdir(exp_id)
        groupby_values["ref"].remove("results.db")

    def moving_average(x, w=7):
        return np.convolve(x, np.ones(w), 'valid') / w

    max_len = 0
    # poi = param of interest
    for value in groupby_values:
        simdirs = groupby_values[value]
        for sim in simdirs:
            thedir = os.path.join(exp_id, sim)
            if value not in chan_data:
                chan_data[value] = []
            if not os.path.exists(thedir + "/InsetChart.json"):
                continue
            with open(thedir + "/InsetChart.json") as fp:
                icj = json.loads(fp.read())
            if chan not in icj["Channels"]:
                raise ValueError(f"Can't find channel {chan} in file. Did find {icj['Channels'].keys()}.")
            new_data = np.asarray(icj["Channels"][chan]["Data"])
            if smoothing:
                new_data = moving_average(new_data)
            chan_data[value].append(new_data)
            if len(new_data) > max_len:
                max_len = len(new_data)
    if max_len == 0:
        raise ValueError(f"No InsetChart.json files with channel data for {chan} and experiment {exp_id}.")

    # If users run simulations that end when prevalence reaches zero, the lengths of the time series
    # can vary. Pad them all to the same length so the mean can be calculated.
    data_for_plotting = {}
    for poi in chan_data:
        data_for_plotting[poi] = []
        for data in chan_data[poi]:
            if len(data) < max_len:
                data = np.pad(data, (0, max_len - len(data)))
            data_for_plotting[poi].append(data)
    return data_for_plotting
```
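Because every series in the returned dictionary is padded to the same length, downstream code can stack each group and reduce it with plain NumPy. A rough sketch (variable names are illustrative only, not part of the library):

```python
import numpy as np

from emod_api.channelreports.plot_icj_means import collect

# Hypothetical experiment id and sweep key, as above.
data = collect(exp_id="my_exp_id", chan="Infected", tag="Base_Infectivity=SWEEP")

for poi, traces in data.items():
    stacked = np.vstack(traces)       # shape: (num_sims_in_group, max_len)
    mean = stacked.mean(axis=0)       # per-timestep mean across simulations
    spread = stacked.std(axis=0)      # per-timestep spread across simulations
    print(poi, mean[:5], spread[:5])
```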