Source code for cyclum.tuning

"""Auto tuning."""

import cyclum.models

import sklearn.decomposition
import sklearn.metrics
import math


class CyclumAutoTune(cyclum.models.AutoEncoder):
    """Circular autoencoder with an automatically decided number of linear components

    We first perform PCA on the data, and record the MSE of keeping the first 1, 2, ..., max_linear_dims + 1 components.
    We then train a circular autoencoder with 0, 1, ..., max_linear_dims linear components.
    We compare the circular autoencoder with i linear components against PCA with (i + 1) components, for i = 0, 1, ...
    We record the first i where the difference in loss compared with PCA is greater than that of both (i - 1) and (i + 1), or just (i + 1) if i == 0.
    At the end, this class is an UNTRAINED model with the optimal number of linear components.
    You can then train it with all your data, more epochs, and a better learning rate.

    :param data: The data used to decide the number of linear components.
        For a large dataset, you may use a representative portion of it.
    :param max_linear_dims: maximum number of linear dimensions.
    :param epochs: number of epochs for each test
    :param verbose: per how many epochs the loss, time consumption, etc. are reported.
    :param rate: training rate
    :param early_stop: stop checking more linear components once the result is decided?
        ONLY affects the elbow plot. NO influence on the result.
    :param encoder_depth: depth of the encoder, i.e., number of *hidden* layers
    :param encoder_width: width of the encoder, one of the following:

        * An integer stands for the number of nodes per layer. All hidden layers will have the same number of nodes.
        * A list, whose length is equal to `encoder_depth`, of integers stands for the numbers of nodes of the layers.

    :param dropout_rate: rate for dropout.
    :param nonlinear_reg: strength of regularization on the nonlinear encoder.
    :param linear_reg: strength of regularization on the linear encoder.

    Examples:
        >>> from cyclum.hdfrw import hdf2mat, mat2hdf
        >>> df = hdf2mat('path_to_hdf.h5')
        >>> m = CyclumAutoTune(df.values, max_linear_dims=5)
        >>> m.train(df.values)
        >>> pseudotime = m.predict_pseudotime(df.values)
        >>> mat2hdf(pseudotime, 'path_to_pseudotime.h5')
    """
    def __init__(self, data, max_linear_dims=3, epochs=500, verbose=100, rate=5e-4, early_stop=False,
                 encoder_depth=2, encoder_width=50, dropout_rate=0.1, nonlinear_reg=1e-4, linear_reg=1e-4):
        print("Auto tuning number of linear components...")
        self.max_linear_dims = max_linear_dims

        print("Performing PCA...")
        pca_model = sklearn.decomposition.PCA(n_components=self.max_linear_dims + 2)
        pca_load = pca_model.fit_transform(data)
        pca_comp = pca_model.components_
        self.pca_loss = [sklearn.metrics.mean_squared_error(data, pca_load[:, 0:(i + 1)] @ pca_comp[0:(i + 1), :])
                         for i in range(self.max_linear_dims + 2)]

        print("Training Autoencoder with...")
        self.ae_loss = []
        best_n_linear_dims = None
        for i in range(self.max_linear_dims + 1):
            print(f" {i} linear dimensions...")
            model = cyclum.models.AutoEncoder(input_width=data.shape[1],
                                              encoder_depth=encoder_depth,
                                              encoder_width=encoder_width,
                                              n_circular_unit=1,
                                              n_logistic_unit=0,
                                              n_linear_unit=0,
                                              n_linear_bypass=i,
                                              dropout_rate=dropout_rate,
                                              nonlinear_reg=nonlinear_reg,
                                              linear_reg=linear_reg)
            history = model.train(data, epochs=epochs, verbose=verbose, rate=rate)
            self.ae_loss.append(history.history['loss'][-1])
            print(self.ae_loss)
            print(self.pca_loss)
            if i > 0 and best_n_linear_dims is None and \
                    (self.pca_loss[i] - self.ae_loss[i] < self.pca_loss[i - 1] - self.ae_loss[i - 1]):
                best_n_linear_dims = i - 1
                print(f"Found! Use {best_n_linear_dims} linear components...")
                if early_stop:
                    break
                else:
                    print("Early stop disabled, continue to check all cases...")
        if best_n_linear_dims is None:
            best_n_linear_dims = max_linear_dims
            print(f"Have not found one. Suggest raising max_linear_dims. Use {max_linear_dims} linear components...")

        super().__init__(input_width=data.shape[1],
                         encoder_depth=encoder_depth,
                         encoder_width=encoder_width,
                         n_circular_unit=1,
                         n_logistic_unit=0,
                         n_linear_unit=0,
                         n_linear_bypass=best_n_linear_dims,
                         dropout_rate=dropout_rate,
                         nonlinear_reg=nonlinear_reg,
                         linear_reg=linear_reg)
    def show_elbow(self):
        """Show an elbow plot of both PCA and autoencoder

        You will observe the point where the autoencoder starts to have a higher loss than PCA.
        The point just before it is considered the best model.

        :return: figure object
        """
        import matplotlib.pyplot as plt
        fig = plt.figure()
        plt.plot(list(range(len(self.pca_loss))), self.pca_loss, "+-")
        plt.plot(list(range(len(self.ae_loss))), self.ae_loss, "x-")
        plt.legend(['PCA', 'AE'])
        plt.xticks(range(max(map(len, [self.pca_loss, self.ae_loss]))))
        plt.xlabel('X: X linear components or (X+1) PCs')
        plt.ylabel('Mean squared error')
        return fig
    def show_bar(self, root=False):
        """Show a bar plot of what fraction of the additional loss is explained by the circular component

        :param root: if True, compare root mean squared errors instead of mean squared errors.
        :return: figure object
        """
        import matplotlib.pyplot as plt
        import numpy
        fig = plt.figure()
        if root:
            linear_handled = numpy.sqrt(self.pca_loss[:-1]) - numpy.sqrt(self.pca_loss[1:])
            circular_handled = numpy.sqrt(self.pca_loss[:-1]) - numpy.sqrt(self.ae_loss)
        else:
            linear_handled = numpy.array(self.pca_loss[:-1]) - numpy.array(self.pca_loss[1:])
            circular_handled = numpy.array(self.pca_loss[:-1]) - numpy.array(self.ae_loss)
        plt.bar(list(range(len(self.ae_loss))), circular_handled / linear_handled)
        plt.xticks(list(range(len(self.ae_loss))))
        plt.xlabel('X: X linear components')
        plt.ylabel('Relative loss decrease')
        return fig
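
A minimal usage sketch (not part of cyclum.tuning) showing how the diagnostic plots above might be used alongside the docstring example before committing to a full training run. The file paths and tuning parameters are illustrative only, and ``savefig`` is standard matplotlib ``Figure`` API rather than part of cyclum.

    # Hypothetical usage sketch: tune, inspect the diagnostics, then retrain on the full data.
    from cyclum.hdfrw import hdf2mat, mat2hdf
    from cyclum.tuning import CyclumAutoTune

    df = hdf2mat('path_to_hdf.h5')                    # cells-by-genes matrix; path is illustrative
    m = CyclumAutoTune(df.values, max_linear_dims=5,  # parameter values here are illustrative
                       epochs=500, early_stop=False)

    fig_elbow = m.show_elbow()                        # PCA vs. autoencoder loss per number of components
    fig_bar = m.show_bar()                            # relative loss decrease attributed to the circular unit
    fig_elbow.savefig('elbow.png')                    # plain matplotlib Figure.savefig
    fig_bar.savefig('bar.png')

    m.train(df.values, epochs=1000)                   # retrain the selected (untrained) model on all data
    mat2hdf(m.predict_pseudotime(df.values), 'path_to_pseudotime.h5')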