Source code for pyssam.statistical_model_base

from abc import ABC, abstractmethod, abstractproperty
from copy import copy
from typing import Any, Tuple
from warnings import warn

import numpy as np
import sklearn
from sklearn.decomposition import PCA


[docs] class StatisticalModelBase(ABC): """Abstract base class for statistical model."""
[docs] def landmark_data_to_column(self, landmark_array): """Reduce an containing spatial information on landmarks in cartesian coordinates to essentially a set of stacked column-vectors. Remove any array dimensions with only one size e.g. (1, 1000, 3) -> (3000,) Parameters ---------- landmark_array : array_like Landmarks spatial coordinates with 2 or 3 dimensions. The final dimension should size of e.g. 3 for coordinates in 3D space. Returns ------- reduced_array : array_like Landmarks where the N-dimensional array is collapsed to N-1 dimensions. Examples ======== >>> import numpy as np >>> import pyssam >>> landmarks_in = np.random.uniform(0, 100, size=(5, 100, 3)) >>> ssm = pyssam.SSM(landmarks_in) >>> print(ssm.landmark_data_to_column(landmarks_in).shape) (5, 300) """ num_data_features = landmark_array.shape[-1] reduced_array = landmark_array.reshape( -1, self.num_landmarks * num_data_features ) return np.squeeze(reduced_array)
@property def num_landmarks(self): """Property for number of landmarks in a shape model entry. Should set by default in model class, e.g. in pyssam.SSM. """ return self._num_landmarks @num_landmarks.setter def num_landmarks(self, _num_landmarks): raise AttributeError("Not allowed to overwrite num_landmarks")
[docs] def scale_dataset(self, dataset): """Take 2D or 3D array representing landmarks for multiple samples in a population, and scale to have zero mean and unit std dev. Parameters ---------- dataset : array_like Landmarks for multiple samples in a population, with 2 or 3 dimensions Returns ------- scaled_dataset : array_like Landmarks for multiple samples, each centered on mean = 0 and std dev = 1 Raises ------ AssertionError If number of dimensions on input data are not equal to 2 or 3 Examples ======== >>> import numpy as np >>> import pyssam >>> landmarks_in = np.random.uniform(0, 100, size=(5, 100, 3)) >>> ssm = pyssam.SSM(landmarks_in) >>> scaled_landmarks = ssm.scale_dataset(landmarks_in) >>> print(abs(round(scaled_landmarks.mean(), 5)), round(scaled_landmarks.std(), 5)) 0.0 1.0 """ dataset = self._check_data_shape(dataset) aligned_dataset = dataset - dataset.mean(axis=1)[:, np.newaxis] scaled_dataset = ( aligned_dataset / aligned_dataset.std(axis=(1, 2))[:, None, None] ) return scaled_dataset
def _check_data_shape(self, dataset: np.ndarray) -> np.ndarray: """Check that data to be scaled for PCA model has ndim = 3. This is useful for appearance data, which is often a single-scalar, therefore ndim is usually 2. In this case, an additional dimension of size 1 is added. Parameters ---------- dataset : array_like N-dimension array of data to model, where each row on the first axis is one sample and each column on the second axis is a landmark value Returns ------- dataset_reshaped : array_like 3D array of data, where 2D input arrays have a third dimension of size 1. Input 3D arrays are untouched Raises ------ AssertionError If ndim not equal to 2 or 2 """ if dataset.ndim == 2: return np.expand_dims(dataset, axis=-1) elif dataset.ndim == 3: return copy(dataset) else: raise AssertionError(f"Unexpected shape {dataset.shape}")
[docs] @abstractmethod def compute_dataset_mean(self) -> np.array: """Average over all samples to produce a column-vector of the mean shape, appearance, or other quantity included in model. Returns ------- mean_columnvector : array_like """ pass
[docs] def create_pca_model( self, dataset: np.ndarray, desired_variance: float = 0.9 ) -> None: """Perform principal component analysis to create statistical model and extract modelling quantites from the PCA class. Parameters ---------- dataset : array_like 2D array of data to model, where each row on the first axis is one sample and each column on the second axis is e.g. shape or appearance for a landmark desired_variance : float Fraction of total variance to be described by the reduced-dimension model Returns ------- None Raises ------ Warning If mean of each sample in dataset not equal to 0 Warning If standard deviation of each sample in dataset not equal to 1 """ assert ( 0.0 < desired_variance <= 1.0 ), f"desired_variance out of bounds {desired_variance}" # perform principal component analysis to train shape model self.pca_object, self.required_mode_number = self.do_pca( dataset, desired_variance ) # get principal components (eigenvectors) and variances (eigenvalues) self.pca_model_components = self.pca_object.components_ self.variance = self.pca_object.explained_variance_ # get standard deviation of each component, # and initialise model parameters as zero self.std = np.sqrt(self.variance) self.model_parameters = np.zeros(len(self.std)) # save desired_variance as global variable to help post-processing and debugging self.desired_variance = desired_variance
[docs] def fit_model_parameters( self, input_sample, pca_model_components: np.ndarray, num_modes: int = 1000000, ) -> np.array: """Find the model parameters which best match the given input_sample with the trained pca model. Assumes that the shape can be described by input_sample \approx dataset_mean + model_modes \cdot model_std. Parameters ---------- input_sample : array_like Column-vector representing landmark information (e.g. shape, appearance) pca_model_components : array_like eigenvectors of covariance matrix, obtain by PCA. num_modes: int Number of principal components (or `modes') to include in model to morph. By default this is set to a high number to set all modes as included. Returns ------- model_parameters : array_like model parameters used to perturb each principal component by some amount 1D array, where values should all be within +/- 3. """ dataset_mean = self.compute_dataset_mean() model_parameters = ( np.dot( (input_sample - dataset_mean), pca_model_components[:num_modes].T, ) / self.std[:num_modes] ) return model_parameters
[docs] def morph_model( self, mean_dataset_columnvector: np.array, pca_model_components: np.ndarray, model_parameters: np.array, num_modes: int = 1000000, ) -> np.array: """Morph the mean dataset based on the PCA weights and variances, with some user-defined model parameters to create a new sample. Parameters ---------- mean_dataset_columnvector : array_like mean shape of the training data in a 1D array. pca_model_components : array_like eigenvectors of covariance matrix, obtain by PCA. model_parameters : array_like model parameters used to perturb each principal component by some amount 1D array, where values should all be within +/- 3. num_modes : int Number of principal components (or `modes') to include in model to morph. By default this is set to a high number to set all modes as included. Returns ------- morphed_output : array_like A 1D array which has been perturbed from the mean shape based on the pca_model and model_parameters. Raises ------ Warning If model parameters are outwith +/- 3 AssertionError If number of dimension in pca_model_components not equal to 2 """ if np.any(abs(model_parameters) > 3.0): if np.any(abs(model_parameters) > 10.0): raise AssertionError( f"Applying extremely large model parameter ({abs(model_parameters).max()}) " "which may produce unrealistic output" ) else: warn( f"Applying large model parameter ({abs(model_parameters).max()}) " "which may produce unrealistic output" ) assert pca_model_components.ndim == 2, ( f"pca model not of expected number of dimensions" f" (shape is {pca_model_components.shape})" ) model_weight = model_parameters * self.std[:num_modes] return mean_dataset_columnvector + np.dot( pca_model_components[:num_modes].T, model_weight )
[docs] def do_pca( self, dataset: np.ndarray, desired_variance: float = 0.9 ) -> Tuple[Any, int]: """Fit principal component analysis to given dataset. Parameters ---------- dataset : array_like 2D array of data to model, where each row on the first axis is one sample and each column on the second axis is e.g. shape or appearance for a landmark desired_variance : float Fraction of total variance to be described by the reduced-dimension model Returns ------- pca : sklearn.decomposition._pca.PCA Object containing fitted PCA information e.g. components, explained variance required_mode_number : int Number of principal components needed to produce desired_variance Raises ------ Warning If mean of each sample in dataset not equal to 0 Warning If standard deviation of each sample in dataset not equal to 1 """ self._check_data_scale(dataset) pca = PCA(svd_solver="full") pca.fit(dataset) required_mode_number = np.where( np.cumsum(pca.explained_variance_ratio_) > desired_variance )[0][0] return pca, required_mode_number
def _check_data_scale(self, dataset: np.ndarray) -> None: """Check that data is appropriate for PCA, meaning that each sample has 0 mean and 1 std. Parameters ---------- dataset : array_like N-dimension array of data to model, where each row on the first axis is one sample and each column on the second axis is a landmark value Returns ------- None Raises ------ Warning If mean of each sample in dataset not equal to 0 Warning If standard deviation of each sample in dataset not equal to 1 """ if np.allclose(dataset.mean(axis=1), 0): pass else: warn("Dataset mean should be 0, " f"is equal to {dataset.mean(axis=1)}") if np.allclose(dataset.std(axis=1), 1): pass else: warn( "Dataset standard deviation should be 1, " f"is equal to {dataset.std(axis=1)}" )