"""
This module is a convenient and efficient tool for loading results metrics
of multiple model training iterations into a single MegaScanner object.
Metrics files can be loaded from disk and plotted for comparative model analysis
and evaluation. Using this approach assumes model training results files match
those generated by spacekit.analyzer.compute.Computer class/subclass objects
and are accessible from the local disk. Primarily used by spacekit.dashboard
but can easily be repurposed for other use-cases (analyzing model performance
in Jupyter notebooks/Google Colab, for example).
"""
import os
import pandas as pd
import glob
from spacekit.analyzer.compute import ComputeBinary, ComputeMulti, ComputeRegressor
from spacekit.logger.log import Logger
from spacekit.skopes.jwst.cal.config import KEYPAIR_DATA
try:
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly import subplots
except ImportError:
go = None
ff = None
subplots = None
def check_plotly():
    """Report whether the optional plotly dependency was imported.

    Returns
    -------
    bool
        True when the module-level ``go`` name is bound to something other
        than None (i.e. the guarded plotly import succeeded), False otherwise.
    """
    plotly_missing = go is None
    return not plotly_missing
def decode_categorical(df, decoder_key):
    """Add a decoded string column (named "{column}_key") for each encoded column.

    Parameters
    ----------
    df : pandas DataFrame
        dataframe containing the encoded categorical column(s)
    decoder_key : dict
        nested dict mapping each encoded column name to its own dict of
        {encoded value: decoded string} pairs

    Returns
    -------
    pandas DataFrame
        the dataframe passed in (modified in place), with one additional
        "{column}_key" column per decoded column. Rows whose encoded value is
        not listed in the decoder are not assigned a decoded value.
    """
    for column in decoder_key:
        decoded_column = f"{column}_key"
        for code, label in decoder_key[column].items():
            # boolean mask of the rows holding this encoded value
            matches = df[column] == code
            df.loc[matches, decoded_column] = label
    return df
def import_dataset(filename=None, kwargs=dict(index_col="ipst"), decoder_key=None):
    """Imports and loads dataset from csv file. Optionally decodes an encoded feature back into strings.

    Parameters
    ----------
    filename : str, optional
        path to dataframe csv file, by default None
    kwargs : dict, optional
        keyword args to pass into pandas read_csv method, by default dict(index_col="ipst").
        Passing None (or an empty dict) reads the csv with pandas defaults.
    decoder_key : dict, optional
        nested dict of column and key value pairs for decoding a categorical feature into strings, by default None

    Returns
    -------
    Pandas DataFrame
        dataframe loaded from csv file

    Raises
    ------
    FileNotFoundError
        if filename is None or does not exist on disk
    """
    # Fail early with a clear message instead of printing and then letting
    # read_csv (or os.path.exists, for None) raise a less helpful error.
    if filename is None or not os.path.exists(filename):
        raise FileNotFoundError(f"File could not be found: {filename}")
    # Copy so the shared default dict is never handed out, and tolerate None.
    read_kwargs = dict(kwargs) if kwargs else {}
    df = pd.read_csv(filename, **read_kwargs)
    if decoder_key:
        # adds a decoded "{column}_key" string column per decoder entry
        df = decode_categorical(df, decoder_key)
    return df
class MegaScanner:
    """
    Scans local disk for Compute object datasets and results files then loads them
    as attributes for use in plotting, EDA, and model evaluation.

    Parameters
    ----------
    perimeter : str, optional
        glob search pattern, by default "data/20??-*-*-*"
    primary : int, optional
        index of primary dataset to use for EDA in sorted list of those found, by default -1
    name : str, optional
        name given to the logger, by default "MegaScanner"
    **log_kws : dict
        additional keyword args passed to the spacekit Logger
    """

    # Shared dark theme applied to every figure built by this class.
    _DARK_LAYOUT = dict(
        paper_bgcolor="#242a44",
        plot_bgcolor="#242a44",
        font={"color": "#ffffff"},
    )

    def __init__(
        self, perimeter="data/20??-*-*-*", primary=-1, name="MegaScanner", **log_kws
    ):
        self.__name__ = name
        self.log = Logger(self.__name__, **log_kws).spacekit_logger()
        # Fail fast: nothing in this module is usable without plotly.
        if not check_plotly():
            self.log.error("plotly not installed.")
            raise ImportError(
                "You must install plotly (`pip install plotly`) "
                "for the scan module to work."
                "\n\nInstall extra deps via `pip install spacekit[x]`"
            )
        self.perimeter = perimeter
        self.datapaths = sorted(glob.glob(perimeter))
        # basename(normpath()) copes with trailing separators and with
        # platform-specific path separators, unlike split("/").
        self.datasets = [
            os.path.basename(os.path.normpath(d)) for d in self.datapaths
        ]
        self.timestamps = [
            int(t.split("-")[-1]) for t in self.datasets
        ]  # [1636048291, 1635457222, 1629663047]
        self.dates = [
            str(v)[:10] for v in self.datasets
        ]  # ["2021-11-04", "2021-10-28", "2021-08-22"]
        self.primary = primary
        self.data = None  # self.select_dataset()
        self.versions = None
        self.res_keys = None
        self.target = None
        self.labels = None
        self.classes = None
        self.mega = None  # self.make_mega()
        self.kwargs = None
        self.decoder = None
        self.df = None  # self.load_dataframe()
        self.scores = None  # self.compare_scores()
        self.acc_fig = None  # self.accuracy_bars()
        self.loss_fig = None  # self.loss_bars()
        self.acc_loss_fig = None  # self.acc_loss_subplots()
        # legacy (plural) spelling kept in sync for backward compatibility
        self.acc_loss_figs = None
        self.res_fig = None  # TODO
        self.keras = {}
        self.roc = {}
        self.cmx = {}

    def select_dataset(self, primary=None):
        """Select which dataset file (if there are multiple timestamps) to use, e.g. for performing EDA.

        Parameters
        ----------
        primary : int, optional
            index of primary dataset to use in sorted list of those found, by default None
            (keeps the current ``self.primary``, initially -1 or most recent timestamp)

        Returns
        -------
        str or None
            path to csv file of saved dataframe according to the primary index key of datasets found.
            None if no dataset directories were found or the selected one contains no csv file.
        """
        # `is not None` so that index 0 (falsy) can be selected explicitly
        if primary is not None:
            self.primary = primary
        n_found = len(self.datapaths)
        if n_found == 0:
            return None
        # valid list indices are -n_found .. n_found - 1
        if not -n_found <= self.primary < n_found:
            self.log.warning("Using default index (-1)")
            self.primary = -1
        dataset_path = self.datapaths[self.primary]
        csv_files = glob.glob(f"{dataset_path}/data/*.csv")
        if not csv_files:
            self.log.warning(f"No csv file found in {dataset_path}/data")
            self.data = None
            return None
        self.data = csv_files[0]
        return self.data

    def make_mega(self):
        """Instantiate an empty nested dictionary of results files for each timestamp.

        Version names are generated ("v0", "v1", ...) unless ``self.versions``
        has already been set, in which case those names are used by position.

        Returns
        -------
        dict
            self.mega nested dictionary for storing results
        """
        from copy import deepcopy

        self.mega = {}
        generated = []
        for i, (d, t) in enumerate(zip(self.dates, self.timestamps)):
            if self.versions is None:
                v = f"v{i}"
                generated.append(v)
            else:
                v = self.versions[i]
            # Each version gets its own copy of the results template;
            # sharing one dict would make every version alias the same results.
            self.mega[v] = {"date": d, "time": t, "res": deepcopy(self.res_keys)}
        if generated:
            self.versions = generated
        return self.mega

    def load_compute_object(
        self, Com=ComputeMulti, alg="clf", res_path="results", validation=False
    ):
        """Loads a single compute object of any type with results from one iteration.

        Parameters
        ----------
        Com : spacekit.analyze.compute.Computer class, optional
            Compute subclass, by default ComputeMulti
        alg : str, optional
            algorithm type, by default "clf"
        res_path : str, optional
            path to results directory, by default "results"
        validation : bool, optional
            validation data results (no training history), by default False

        Returns
        -------
        spacekit.analyze.compute.Computer object
            Results from the given path loaded as attributes into a Compute class object
        """
        com_kwargs = dict(algorithm=alg, res_path=res_path, validation=validation)
        # regression objects are constructed without class labels
        if alg not in ("reg", "linreg"):
            com_kwargs["classes"] = self.labels
        com = Com(**com_kwargs)
        out = com.upload()
        com.load_results(out)
        if alg == "clf":
            # best effort: a plotting failure should not prevent loading results
            try:  # initialize Compute figure attrs
                com.draw_plots()
            except Exception as e:
                self.log.error(e)
        return com

    def _scan_results(self, coms=(ComputeBinary,), algs=("clf",), names=("test",)):
        """Scans local disk for Computer object-generated results files of model training iterations.

        Parameters
        ----------
        coms : sequence of Compute classes, optional
            class used to load each result set, by default (ComputeBinary,)
        algs : sequence of str, optional
            algorithm type passed to each Compute class, by default ("clf",)
        names : sequence of str, optional
            results subdirectory (and results key) for each entry, by default ("test",)

        Returns
        -------
        MegaScanner.mega dictionary attribute
            dictionary of model training results for each iteration found.
        """
        objects = list(zip(coms, algs, names))
        self.mega = self.make_mega()
        for i, d in enumerate(self.datapaths):
            v = self.versions[i]
            for C, A, N in objects:
                com = C(algorithm=A, classes=self.labels, res_path=f"{d}/results/{N}")
                com_out = com.upload()
                com.load_results(com_out)
                self.mega[v]["res"][N] = com
        return self.mega

    def load_dataframe(self):
        """Load the selected dataset csv (``self.data``) into ``self.df``.

        Uses ``self.kwargs`` as pandas read_csv keyword args (none if unset)
        and ``self.decoder`` to decode categorical columns.

        Returns
        -------
        Pandas DataFrame
            dataframe loaded from the csv file at ``self.data``
        """
        # self.kwargs is None until a subclass sets it; never pass None through
        read_kwargs = self.kwargs if self.kwargs is not None else {}
        self.df = import_dataset(
            filename=self.data, kwargs=read_kwargs, decoder_key=self.decoder
        )
        return self.df

    def make_clf_plots(self, target="mem_bin"):
        """Draw classifier plots for every version and collect the figures.

        Populates ``self.keras`` (accuracy and loss figures per version),
        ``self.roc`` (ROC and precision-recall figures per version) and
        ``self.cmx`` (confusion matrices for all versions).

        Parameters
        ----------
        target : str, optional
            results key of the classifier compute object, by default "mem_bin"
        """
        for v in self.versions:
            com = self.mega[v]["res"][target]
            com.draw_plots()
            self.keras[v] = [com.acc_fig, com.loss_fig]
            self.roc[v] = [com.roc_fig, com.pr_fig]
        # cmx for all versions displayed at once, unlike the two attrs above
        self.cmx = {
            "normalized": [self.mega[v]["res"][target].cmx_norm for v in self.versions],
            "counts": [self.mega[v]["res"][target].cmx for v in self.versions],
        }

    def make_barplots(self, metric="acc_loss"):
        """Build the score comparison table and the accuracy/loss bar figures.

        NOTE(review): accuracy_bars reads the "train_acc"/"test_acc" rows of
        self.scores; confirm those rows exist when metric is not "acc_loss".

        Parameters
        ----------
        metric : str, optional
            metric used by model (clf=acc_loss, reg=loss), by default "acc_loss"
        """
        self.compare_scores(metric=metric)
        self.acc_fig = self.accuracy_bars()
        self.loss_fig = self.loss_bars()
        self.acc_loss_subplots()

    def compare_scores(self, metric="acc_loss"):
        """Create a dataframe of model scores for multiple training iterations.
        Score type depends on the type of model: classifiers typically use "acc_loss";
        Regression models typically use "loss". Scores are read from the compute
        object stored under ``self.target`` for each version.

        Parameters
        ----------
        metric : str, optional
            metric used by model (clf=acc_loss, reg=loss), by default "acc_loss"

        Returns
        -------
        Pandas dataframe
            model evaluation metrics scores (accuracy/loss by default) for each model training iteration
        """
        score_dfs = []
        for v in self.versions:
            com = self.mega[v]["res"][self.target]
            score_dict = com.acc_loss if metric == "acc_loss" else com.loss
            score_dfs.append(
                pd.DataFrame.from_dict(score_dict, orient="index", columns=[v])
            )
        self.scores = pd.concat(score_dfs, axis=1)
        return self.scores

    def _train_test_bars(self, metric, title, names, colors):
        """Grouped barplot of the train_{metric} and test_{metric} rows of self.scores."""
        versions = list(self.scores.columns)
        data = []
        for split, name, color in zip(("train", "test"), names, colors):
            values = self.scores.loc[f"{split}_{metric}"].values
            data.append(
                go.Bar(
                    x=list(range(len(values))),
                    y=values,
                    hovertext=versions,
                    name=name,
                    marker=dict(color=color),
                )
            )
        layout = go.Layout(
            title=title,
            xaxis={"title": "training iteration"},
            yaxis={"title": "score"},
            **self._DARK_LAYOUT,
        )
        return go.Figure(data=data, layout=layout)

    def accuracy_bars(self):
        """Barplots of training and test set accuracy scores loaded from a Pandas dataframe

        Returns
        -------
        plotly.graph_objs.Figure
            Grouped barplot figure data of training and test set accuracy scores.
        """
        return self._train_test_bars(
            "acc",
            "Accuracy",
            ("Training Accuracy", "Test Accuracy"),
            ("dodgerblue", "#66c2a5"),
        )

    def loss_bars(self):
        """Barplots of training and test set loss scores loaded from a Pandas dataframe

        Returns
        -------
        plotly.graph_objs.Figure
            Grouped barplot figure data of training and test set loss scores.
        """
        return self._train_test_bars(
            "loss",
            "Loss",
            ("Training Loss", "Test Loss"),
            ("salmon", "peachpuff"),
        )

    def acc_loss_subplots(self):
        """Side by side grouped barplots of accuracy and loss metrics for multiple model training iterations.

        Requires ``self.acc_fig`` and ``self.loss_fig`` to be set already
        (see make_barplots). The figure is stored as ``self.acc_loss_fig``.

        Returns
        -------
        plotly.subplots object
            plot figure traces and layout for side by side Accuracy and Loss grouped barplots
        """
        fig = subplots.make_subplots(
            rows=1,
            cols=2,
            subplot_titles=("Accuracy", "Loss"),
            shared_yaxes=False,
            x_title="Training Iteration",
            y_title="Score",
        )
        # column 1: accuracy (train, test); column 2: loss (train, test)
        for col, source in enumerate((self.acc_fig, self.loss_fig), start=1):
            fig.add_trace(source.data[0], 1, col)
            fig.add_trace(source.data[1], 1, col)
        fig.update_layout(
            title_text="Accuracy vs. Loss",
            margin=dict(t=50, l=200),
            **self._DARK_LAYOUT,
        )
        self.acc_loss_fig = fig
        # keep the legacy attribute name initialised by __init__ in sync
        self.acc_loss_figs = fig
        return self.acc_loss_fig

    @staticmethod
    def _cmx_scale(cmx_type):
        """Return (zmin, zmax, cell text format) for the given confusion matrix type."""
        if cmx_type == "normalized":
            return 0.0, 1.0, "{:.2f}"
        return 0, 100, "{:d}"

    def _cmx_heatmap(self, matrix, cmx_type):
        """Build an annotated heatmap figure for one confusion matrix."""
        zmin, zmax, fmt = self._cmx_scale(cmx_type)
        x = self.labels
        y = x[::-1].copy()
        # rows reversed to line up with the reversed y-axis labels
        z = matrix[::-1]
        z_text = [[fmt.format(cell) for cell in row] for row in z]
        return ff.create_annotated_heatmap(
            z=z,
            x=x,
            y=y,
            annotation_text=z_text,
            colorscale="Blues",
            zmin=zmin,
            zmax=zmax,
        )

    def single_cmx(
        self, cmx, subtitles=("v0",), zmin=0.0, zmax=1.0, cmx_type="normalized"
    ):
        """Confusion matrix plot for a single model training iteration

        Parameters
        ----------
        cmx : 2D numpy array
            confusion matrix
        subtitles : tuple or str, optional
            text to place above the plot as a subtitle, by default ("v0",).
            A plain string is treated as a single subtitle.
        zmin : int or float
            retained for backward compatibility; the colorscale minimum is
            determined by cmx_type (0)
        zmax : int
            retained for backward compatibility; the colorscale maximum is
            determined by cmx_type (1.0 if normalized, otherwise 100)
        cmx_type : str, optional
            "normalized" formats cells as 2-decimal floats, anything else as integers, by default "normalized"

        Returns
        -------
        plotly figure factory annotated heatmap figure
            interactive confusion matrix plot
        """
        # a bare string would be iterated character by character as titles
        if isinstance(subtitles, str):
            subtitles = (subtitles,)
        fig = subplots.make_subplots(
            rows=1,
            cols=1,
            subplot_titles=subtitles,
            shared_yaxes=False,
            x_title="Predicted",
            y_title="Actual",
        )
        fig.update_layout(title_text="Confusion Matrix", **self._DARK_LAYOUT)
        # make traces
        heatmap = self._cmx_heatmap(cmx, cmx_type)
        fig.add_trace(heatmap.data[0], 1, 1)
        # add colorbar
        fig["data"][0]["showscale"] = True
        # annotation values for each square: add them one at a time
        # (add_annotation accepts a single annotation, not a list)
        for anno in list(heatmap.layout.annotations):
            fig.add_annotation(anno)
        return fig

    def triple_cmx(self, cmx, cmx_type):
        """Plot confusion matrices side by side (three columns, more if more matrices are given)

        Parameters
        ----------
        cmx : sequence of 2D arrays
            one confusion matrix per model training iteration
        cmx_type : str
            "normalized" will return a normalized CMX (percentage of FNFPs), otherwise raw numeric values are displayed.

        Returns
        -------
        plotly figure factory annotated heatmap subplots
            interactive confusion matrices side by side as a subplot
        """
        fig = subplots.make_subplots(
            rows=1,
            # at least 3 columns as before; widened if more matrices are passed
            cols=max(3, len(cmx)),
            subplot_titles=self.versions,  # ("v1", "v2", "v3")
            shared_yaxes=False,
            x_title="Predicted",
            y_title="Actual",
        )
        fig.update_layout(title_text="Confusion Matrix", **self._DARK_LAYOUT)
        new_annotations = []
        for col, matrix in enumerate(cmx, start=1):
            cmx_fig = self._cmx_heatmap(matrix, cmx_type)
            fig.add_trace(cmx_fig.data[0], 1, col)
            annot = list(cmx_fig.layout.annotations)
            # point each cell annotation at the axes of its own subplot
            for a in annot:
                a["xref"] = f"x{col}"
                a["yref"] = f"y{col}"
            new_annotations.extend(annot)
        # add colorbar
        fig["data"][0]["showscale"] = True
        # annotation values for each square
        for anno in new_annotations:
            fig.add_annotation(anno)
        return fig
class HstCalScanner(MegaScanner):
    """MegaScanner subclass for HST calibration model training iteration analysis

    Parameters
    ----------
    perimeter : str, optional
        glob search pattern, by default "data/20??-*-*-*"
    primary : int, optional
        index of primary dataset to use in sorted list of those found, by default -1
    **log_kws : dict
        additional keyword args passed through to MegaScanner
    """

    def __init__(self, perimeter="data/20??-*-*-*", primary=-1, **log_kws):
        super().__init__(
            perimeter=perimeter, primary=primary, name="HstCalScanner", **log_kws
        )
        # memory bin class labels and their encoded values
        self.labels = ["2g", "8g", "16g", "64g"]
        self.classes = list(range(4))
        # one results slot per model: classifier plus two regressors
        self.res_keys = {"mem_bin": None, "memory": None, "wallclock": None}
        self.target = next(iter(self.res_keys))
        self.data = self.select_dataset()
        self.mega = self.make_mega()
        self.kwargs = {"index_col": "ipst"}
        instruments = ("acs", "cos", "stis", "wfc3")
        self.decoder = {"instr": dict(enumerate(instruments))}

    def scan_results(self):
        """Load the compute objects of every dataset found and store them,
        per version, under the "res" key of the mega dictionary.

        Returns
        -------
        HstCalScanner.mega dictionary attribute
            dictionary of model training results for each iteration found.
        """
        # load everything first so mega is only touched once all loads succeed
        loaded = [self.load_com_objects(path) for path in self.datapaths]
        for idx, version in enumerate(self.versions):
            mem_bin, memory, wallclock = loaded[idx]
            self.mega[version]["res"] = {
                "mem_bin": mem_bin,
                "memory": memory,
                "wallclock": wallclock,
            }
        return self.mega

    def load_com_objects(self, dpath):
        """Loads Multi classifier and Regression compute objects (3 total) for a single iteration of results

        Parameters
        ----------
        dpath : str
            dataset directory path; results are read from its
            "results/mem_bin", "results/memory" and "results/wallclock" subdirectories

        Returns
        -------
        tuple
            tuple of mem_bin, memory, wallclock compute objects for one iteration
        """
        # bound outside the generator: zero-argument super() needs method scope
        load = super().load_compute_object
        specs = (
            (ComputeMulti, "clf", "mem_bin"),
            (ComputeRegressor, "linreg", "memory"),
            (ComputeRegressor, "linreg", "wallclock"),
        )
        return tuple(
            load(Com=com_class, alg=alg, res_path=f"{dpath}/results/{model}")
            for com_class, alg, model in specs
        )
class HstSvmScanner(MegaScanner):
    """MegaScanner subclass for HST Single Visit Mosaic alignment model training iteration analysis

    Parameters
    ----------
    perimeter : str, optional
        glob search pattern, by default "data/20??-*-*-*"
    primary : int, optional
        index of primary dataset to use in sorted list of those found, by default -1
    **log_kws : dict
        additional keyword args passed through to MegaScanner
    """

    def __init__(self, perimeter="data/20??-*-*-*", primary=-1, **log_kws):
        super().__init__(
            perimeter=perimeter, primary=primary, name="HstSvmScanner", **log_kws
        )
        self.labels = ["aligned", "misaligned"]
        self.classes = [0, 1]
        # one results slot each for the test set and the validation set
        self.res_keys = {"test": {}, "val": {}}
        self.target = list(self.res_keys.keys())[0]
        self.data = self.select_dataset()
        self.mega = self.make_mega()
        self.kwargs = dict(index_col="index")
        self.decoder = {"det": {0: "hrc", 1: "ir", 2: "sbc", 3: "uvis", 4: "wfc"}}

    def scan_results(self):
        """Scans local disk for Computer object-generated results files and stores
        them as new Compute objects (according to the model type) in a nested dictionary.

        Returns
        -------
        HstSvmScanner.mega dictionary attribute
            dictionary of model training results for each iteration found.
        """
        # load everything first so mega is only touched once all loads succeed
        com_objects = [self.load_com_objects(d) for d in self.datapaths]
        for i, v in enumerate(self.versions):
            tcom, vcom = com_objects[i]
            self.mega[v]["res"] = dict(test=tcom, val=vcom)
        # return the populated dictionary, as documented and as the other
        # scanner subclasses do
        return self.mega

    def load_com_objects(self, dpath):
        """Load Binary classifier compute objects for a single iteration of test and validation results

        Parameters
        ----------
        dpath : str
            dataset directory path; results are read from its
            "results/test" and "results/val" subdirectories

        Returns
        -------
        tuple
            tuple of test and validation compute objects for one iteration
        """
        T = super().load_compute_object(
            Com=ComputeBinary, alg="binary", res_path=f"{dpath}/results/test"
        )
        V = super().load_compute_object(
            Com=ComputeBinary,
            alg="binary",
            res_path=f"{dpath}/results/val",
            validation=True,  # validation results have no training history
        )
        return (T, V)
class JwstCalScanner(MegaScanner):
    """MegaScanner subclass for JWST calibration model training iteration analysis

    Parameters
    ----------
    perimeter : str, optional
        glob search pattern, by default "data/20??-*-*-*"
    primary : int, optional
        index of primary dataset to use in sorted list of those found, by default -1
    **log_kws : dict
        additional keyword args passed through to MegaScanner
    """

    def __init__(self, perimeter="data/20??-*-*-*", primary=-1, **log_kws):
        super().__init__(
            perimeter=perimeter, primary=primary, name="JwstCalScanner", **log_kws
        )
        # regression only: no class labels
        self.labels = []
        self.classes = []
        self.res_keys = {"img3_reg": None}
        self.target = next(iter(self.res_keys))
        self.data = self.select_dataset()
        self.mega = self.make_mega()
        self.kwargs = {"index_col": "img_name"}
        self.decoder = KEYPAIR_DATA

    def scan_results(self):
        """Load the regression compute object of every dataset found and store it,
        per version, under the "res" key of the mega dictionary.

        Returns
        -------
        JwstCalScanner.mega dictionary attribute
            dictionary of model training results for each iteration found.
        """
        # load everything first so mega is only touched once all loads succeed
        loaded = [self.load_com_objects(path) for path in self.datapaths]
        for idx, version in enumerate(self.versions):
            self.mega[version]["res"] = {"img3_reg": loaded[idx]}
        return self.mega

    def load_com_objects(self, dpath):
        """Loads the img3_reg Regression compute object for a single iteration of results

        Parameters
        ----------
        dpath : str
            dataset directory path; results are read from its
            "results/img3_reg" subdirectory

        Returns
        -------
        compute object
            img3_reg regression compute object for one iteration
        """
        results_path = f"{dpath}/results/img3_reg"
        return super().load_compute_object(
            Com=ComputeRegressor, alg="linreg", res_path=results_path
        )