Source code for pyssam.statistical_model_base

from abc import ABC, abstractmethod, abstractproperty
from copy import copy
from typing import Any, Tuple
from warnings import warn

import numpy as np
import sklearn
from sklearn.decomposition import PCA



[docs]
class StatisticalModelBase(ABC):
  """Abstract base class for statistical model."""


[docs]
  def landmark_data_to_column(self, landmark_array):
    """Reduce an containing spatial information on landmarks in cartesian
    coordinates to essentially a set of stacked column-vectors. Remove any
    array dimensions with only one size e.g. (1, 1000, 3) -> (3000,)

    Parameters
    ----------
    landmark_array : array_like
        Landmarks spatial coordinates with 2 or 3 dimensions. The final dimension
        should size of e.g. 3 for coordinates in 3D space.

    Returns
    -------
    reduced_array : array_like
        Landmarks where the N-dimensional array is collapsed to N-1 dimensions.

    Examples
    ========
    >>> import numpy as np
    >>> import pyssam
    >>> landmarks_in = np.random.uniform(0, 100, size=(5, 100, 3))
    >>> ssm = pyssam.SSM(landmarks_in)
    >>> print(ssm.landmark_data_to_column(landmarks_in).shape)
    (5, 300)
    """
    num_data_features = landmark_array.shape[-1]
    reduced_array = landmark_array.reshape(
      -1, self.num_landmarks * num_data_features
    )
    return np.squeeze(reduced_array)


  @property
  def num_landmarks(self):
    """Property for number of landmarks in a shape model entry.

    Should set by default in model class, e.g. in pyssam.SSM.
    """
    return self._num_landmarks

  @num_landmarks.setter
  def num_landmarks(self, _num_landmarks):
    raise AttributeError("Not allowed to overwrite num_landmarks")


[docs]
  def scale_dataset(self, dataset):
    """Take 2D or 3D array representing landmarks for multiple samples in a
    population, and scale to have zero mean and unit std dev.

    Parameters
    ----------
    dataset : array_like
        Landmarks for multiple samples in a population, with 2 or 3 dimensions

    Returns
    -------
    scaled_dataset : array_like
        Landmarks for multiple samples, each centered on mean = 0 and std dev = 1

    Raises
    ------
    AssertionError
        If number of dimensions on input data are not equal to 2 or 3

    Examples
    ========
    >>> import numpy as np
    >>> import pyssam
    >>> landmarks_in = np.random.uniform(0, 100, size=(5, 100, 3))
    >>> ssm = pyssam.SSM(landmarks_in)
    >>> scaled_landmarks = ssm.scale_dataset(landmarks_in)
    >>> print(abs(round(scaled_landmarks.mean(), 5)), round(scaled_landmarks.std(), 5))
    0.0 1.0
    """
    dataset = self._check_data_shape(dataset)
    aligned_dataset = dataset - dataset.mean(axis=1)[:, np.newaxis]
    scaled_dataset = (
      aligned_dataset / aligned_dataset.std(axis=(1, 2))[:, None, None]
    )
    return scaled_dataset


  def _check_data_shape(self, dataset: np.ndarray) -> np.ndarray:
    """Check that data to be scaled for PCA model has ndim = 3. This is useful
    for appearance data, which is often a single-scalar, therefore ndim is usually 2.
    In this case, an additional dimension of size 1 is added.

    Parameters
    ----------
    dataset : array_like
        N-dimension array of data to model, where each row on the first axis is one sample
        and each column on the second axis is a landmark value

    Returns
    -------
    dataset_reshaped : array_like
        3D array of data, where 2D input arrays have a third dimension of size 1.
        Input 3D arrays are untouched

    Raises
    ------
    AssertionError
        If ndim not equal to 2 or 2
    """
    if dataset.ndim == 2:
      return np.expand_dims(dataset, axis=-1)
    elif dataset.ndim == 3:
      return copy(dataset)
    else:
      raise AssertionError(f"Unexpected shape {dataset.shape}")


[docs]
  @abstractmethod
  def compute_dataset_mean(self) -> np.array:
    """Average over all samples to produce a column-vector of the mean shape,
    appearance, or other quantity included in model.

    Returns
    -------
    mean_columnvector : array_like
    """
    pass



[docs]
  def create_pca_model(
    self, dataset: np.ndarray, desired_variance: float = 0.9
  ) -> None:
    """Perform principal component analysis to create statistical model and
    extract modelling quantites from the PCA class.

    Parameters
    ----------
    dataset : array_like
        2D array of data to model, where each row on the first axis is one sample
        and each column on the second axis is e.g. shape or appearance for a landmark
    desired_variance : float
        Fraction of total variance to be described by the reduced-dimension model

    Returns
    -------
    None

    Raises
    ------
    Warning
        If mean of each sample in dataset not equal to 0
    Warning
        If standard deviation of each sample in dataset not equal to 1
    """
    assert (
      0.0 < desired_variance <= 1.0
    ), f"desired_variance out of bounds {desired_variance}"
    # perform principal component analysis to train shape model
    self.pca_object, self.required_mode_number = self.do_pca(
      dataset, desired_variance
    )
    # get principal components (eigenvectors) and variances (eigenvalues)
    self.pca_model_components = self.pca_object.components_
    self.variance = self.pca_object.explained_variance_
    # get standard deviation of each component,
    # and initialise model parameters as zero
    self.std = np.sqrt(self.variance)
    self.model_parameters = np.zeros(len(self.std))
    # save desired_variance as global variable to help post-processing and debugging
    self.desired_variance = desired_variance



[docs]
  def fit_model_parameters(
    self,
    input_sample,
    pca_model_components: np.ndarray,
    num_modes: int = 1000000,
  ) -> np.array:
    """Find the model parameters which best match the given input_sample with
    the trained pca model. Assumes that the shape can be described by
    input_sample \approx dataset_mean + model_modes \cdot model_std.

    Parameters
    ----------
    input_sample : array_like
        Column-vector representing landmark information (e.g. shape, appearance)
    pca_model_components : array_like
        eigenvectors of covariance matrix, obtain by PCA.
    num_modes: int
        Number of principal components (or `modes') to include in model
        to morph. By default this is set to a high number to set all modes
        as included.

    Returns
    -------
    model_parameters : array_like
        model parameters used to perturb each principal component by some amount
        1D array, where values should all be within +/- 3.
    """
    dataset_mean = self.compute_dataset_mean()
    model_parameters = (
      np.dot(
        (input_sample - dataset_mean),
        pca_model_components[:num_modes].T,
      )
      / self.std[:num_modes]
    )
    return model_parameters



[docs]
  def morph_model(
    self,
    mean_dataset_columnvector: np.array,
    pca_model_components: np.ndarray,
    model_parameters: np.array,
    num_modes: int = 1000000,
  ) -> np.array:
    """Morph the mean dataset based on the PCA weights and variances, with some
    user-defined model parameters to create a new sample.

    Parameters
    ----------
    mean_dataset_columnvector : array_like
        mean shape of the training data in a 1D array.
    pca_model_components : array_like
        eigenvectors of covariance matrix, obtain by PCA.
    model_parameters : array_like
        model parameters used to perturb each principal component by some amount
        1D array, where values should all be within +/- 3.
    num_modes : int
        Number of principal components (or `modes') to include in model
        to morph. By default this is set to a high number to set all modes
        as included.

    Returns
    -------
    morphed_output : array_like
        A 1D array which has been perturbed from the mean shape based
        on the pca_model and model_parameters.

    Raises
    ------
    Warning
        If model parameters are outwith +/- 3
    AssertionError
        If number of dimension in pca_model_components not equal to 2
    """
    if np.any(abs(model_parameters) > 3.0):
      if np.any(abs(model_parameters) > 10.0):
        raise AssertionError(
          f"Applying extremely large model parameter ({abs(model_parameters).max()}) "
          "which may produce unrealistic output"
        )
      else:
        warn(
          f"Applying large model parameter ({abs(model_parameters).max()}) "
          "which may produce unrealistic output"
        )

    assert pca_model_components.ndim == 2, (
      f"pca model not of expected number of dimensions"
      f" (shape is {pca_model_components.shape})"
    )
    model_weight = model_parameters * self.std[:num_modes]
    return mean_dataset_columnvector + np.dot(
      pca_model_components[:num_modes].T, model_weight
    )



[docs]
  def do_pca(
    self, dataset: np.ndarray, desired_variance: float = 0.9
  ) -> Tuple[Any, int]:
    """Fit principal component analysis to given dataset.

    Parameters
    ----------
    dataset : array_like
        2D array of data to model, where each row on the first axis is one sample
        and each column on the second axis is e.g. shape or appearance for a landmark
    desired_variance : float
        Fraction of total variance to be described by the reduced-dimension model

    Returns
    -------
    pca : sklearn.decomposition._pca.PCA
        Object containing fitted PCA information e.g. components, explained variance
    required_mode_number : int
        Number of principal components needed to produce desired_variance

    Raises
    ------
    Warning
        If mean of each sample in dataset not equal to 0
    Warning
        If standard deviation of each sample in dataset not equal to 1
    """
    self._check_data_scale(dataset)

    pca = PCA(svd_solver="full")
    pca.fit(dataset)
    required_mode_number = np.where(
      np.cumsum(pca.explained_variance_ratio_) > desired_variance
    )[0][0]

    return pca, required_mode_number


  def _check_data_scale(self, dataset: np.ndarray) -> None:
    """Check that data is appropriate for PCA, meaning that each sample has 0
    mean and 1 std.

    Parameters
    ----------
    dataset : array_like
        N-dimension array of data to model, where each row on the first axis is one sample
        and each column on the second axis is a landmark value

    Returns
    -------
    None

    Raises
    ------
    Warning
        If mean of each sample in dataset not equal to 0
    Warning
        If standard deviation of each sample in dataset not equal to 1
    """
    if np.allclose(dataset.mean(axis=1), 0):
      pass
    else:
      warn("Dataset mean should be 0, " f"is equal to {dataset.mean(axis=1)}")

    if np.allclose(dataset.std(axis=1), 1):
      pass
    else:
      warn(
        "Dataset standard deviation should be 1, "
        f"is equal to {dataset.std(axis=1)}"
      )