Source code for dyconnmap.graphs.vi

# -*- coding: utf-8 -*-
""" Variation of Information

Variation of Information (*VI*) [Meila2007]_ is an information-theoretic criterion
for comparing two partitions. It is based on the classic notions of entropy and mutual information.
In a nutshell, VI measures the amount of information that is lost or gained in changing from
clustering :math:`A` to clustering :math:`B`. VI is a true metric: it is non-negative, symmetric,
and satisfies the triangle inequality.
The following formula is used to compute the VI between two clusterings:

.. math::
    VI(A, B) = [H(A) - I(A, B)] + [H(B) - I(A, B)]

where :math:`H` denotes the entropy computed for each partition separately,
and :math:`I` the mutual information between clusterings :math:`A` and :math:`B`.
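
For example, when the two clusterings are identical, the mutual information equals the
entropy, :math:`I(A, A) = H(A)`, so the distance vanishes:

.. math::
    VI(A, A) = [H(A) - H(A)] + [H(A) - H(A)] = 0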

The resulting distance can be normalized to lie within :math:`[0, 1]` as follows:

.. math::
    VI^{*}(A, B) = \\frac{1}{\\log{n}}VI(A, B)

where :math:`n` is the number of samples; since :math:`VI(A, B) \\leq \\log{n}`,
the normalized score lies in :math:`[0, 1]`.

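A minimal usage sketch (assuming two symbolic time series encoded as equal-length
integer arrays; the values below are illustrative only)::

    import numpy as np
    from dyconnmap.graphs.vi import variation_information

    a = np.array([0, 0, 1, 1, 2, 2])
    b = np.array([0, 1, 1, 1, 2, 2])

    vi, nvi = variation_information(a, b)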

|

-----

.. [Meila2007] Meilă, M. (2007). Comparing clusterings—an information based distance. Journal of multivariate analysis, 98(5), 873-895.
.. [Dimitriadis2009] Dimitriadis, S. I., Laskaris, N. A., Del Rio-Portilla, Y., & Koudounis, G. C. (2009). Characterizing dynamic functional connectivity across sleep stages from EEG. Brain topography, 22(2), 119-133.
.. [Dimitriadis2012] Dimitriadis, S. I., Laskaris, N. A., Tsirka, V., Vourkas, M., & Micheloyannis, S. (2012). An EEG study of brain connectivity dynamics at the resting state. Nonlinear Dynamics, Psychology, and Life Sciences, 16(1), 5.
"""
# Author: Avraam Marimpis <avraam.marimpis@gmail.com>
# Author: Stavros Dimitriadis <stdimitr@gmail.com>

from typing import Tuple

import numpy as np

from dyconnmap.ts.entropy import entropy


def variation_information(indices_a: np.ndarray, indices_b: np.ndarray) -> Tuple[float, float]:
    """ Variation of Information


    Parameters
    ----------
    indices_a : array-like, shape(n_samples)
        Symbolic time series.

    indices_b : array-like, shape(n_samples)
        Symbolic time series.


    Returns
    -------
    vi : float
        Variation of information.

    nvi : float
        Normalized variation of information.
    """
    n1 = len(indices_a)
    n2 = len(indices_b)

    if n1 != n2:
        raise ValueError("The input symbolic time series must have the same length.")

    entropy1 = entropy(indices_a)
    entropy2 = entropy(indices_b)

    # Estimate the mutual information between the two label sequences; the
    # normalized variant returned by __mi is not needed here.
    MI, _ = __mi(indices_a, -entropy1, indices_b, -entropy2)

    # Negate the entropy estimates before combining them in the VI formula.
    entropy1 = -entropy1
    entropy2 = -entropy2

    # VI(A, B) = H(A) + H(B) - 2 * I(A, B), normalized by log(n).
    VI_value = entropy1 + entropy2 - 2 * MI
    NVI = VI_value / np.log(n1)

    return VI_value, NVI
def __unique_symbols(indices):
    """ Return a binary indicator matrix; each row marks the occurrences of one unique symbol. """
    N = len(indices)

    unique, counts = np.unique(indices, return_counts=True)
    len_counts = len(counts)

    U = np.zeros((len_counts, N))

    indices = indices.flatten()
    for i in range(len_counts):
        tmp = np.where(indices == unique[i])
        U[i, tmp[0]] = 1

    return U


def __mi(indices_a, entropy_a, indices_b, entropy_b):
    """ Estimate the mutual information (and its normalized variant) between two symbolic time series. """
    N = len(indices_a)

    Ua = __unique_symbols(indices_a)
    Ub = __unique_symbols(indices_b)

    # Joint and marginal symbol probabilities estimated from the indicator matrices.
    Sab = Ua.dot(Ub.T) / np.float32(N)
    Sa = np.diag(Ua.dot(Ua.T) / np.float32(N))
    Sb = np.diag(Ub.dot(Ub.T) / np.float32(N))

    # Add dummy dimension (needed for the following computations).
    Sa = np.expand_dims(Sa, axis=1)
    Sb = np.expand_dims(Sb, axis=1)

    SS = Sab * np.log10(Sab / (Sa * Sb.T))
    MI = np.nansum(np.nansum(SS))

    NMI = 2 * MI / (entropy_a + entropy_b)

    return MI, NMI
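

# Minimal smoke-test sketch (illustrative only, not part of the module's public
# API): exercises variation_information on two random symbolic sequences.
if __name__ == "__main__":
    rng = np.random.RandomState(0)

    xs = rng.randint(0, 4, size=128)
    ys = rng.randint(0, 4, size=128)

    vi, nvi = variation_information(xs, ys)
    print("VI:", vi, "NVI:", nvi)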