MBG wiki | RecentChanges | Blog | 2024-11-21 | 2024-11-20

Comparison of sets of eigenvectors

If you have two independent trajectories (or, two independent pieces from the same trajectory), you may want to compare (quantitatively) the corresponding eigenvectors. The aim for such a comparison is to show (or otherwise) that the principal components determined from the two trajectories agree with each other (and, thus, that they may indeed correspond to the principal components of two sufficiently sampled trajectories).

The most commonly used measure, is to calculate is the overlap (Hess (2000), Phys. Rev. E, 62, 8438, Amandei et al (1999), Proteins, 36, 419) :

$$ 
overlap({\bf v}, {\bf w}) = \frac{1}{n} \sum_{i=1}^{n} \sum_{j=1}^{n} ({\bf v}_i  \cdot {\bf w}_j )^2
 $$

v and w are are two sets of eigenvectors, each set containing n eigenvectors (the operation inside the summation is the dot product).

The C program that follows implements the above equation. It will read a specified number of eigenvectors from two files, and it will calculate the overlap of the corresponding sub-spaces. The way to run the program is something in the spirit of

./a.out carma.eigenvectors.1 carma.eigenvectors.2 number_of_eigenvectors

The files should contain one (unitary) eigenvector per column (as produced by carma).

#include <stdio.h>
#include <math.h>

#define         MAXVECTORS      100
#define         MAXVECTLEN      3000
#define         MAXLINELEN      25000

main(argc, argv)
int     argc;
char    *argv[];
{
        FILE    *file1, *file2;
        int     n;
        int     i, j, l, k1, k2;
        double  inter, over;
        float   eig1[MAXVECTORS][MAXVECTLEN];
        float   eig2[MAXVECTORS][MAXVECTLEN];
        char    line[MAXLINELEN];



/* Input parsing and file handling */

        if ( argc != 4 )
                {
                        printf("Usage: eigenoverlap file1 file2 number_of_vectors\n");
                        exit(1);
                }

        file1 = fopen( argv[1], "r");
        file2 = fopen( argv[2], "r");
        if ( file1 == NULL || file2 == NULL )
                {
                        printf("Can not open input files.\n");
                        exit(1);
                }

        n = atoi( argv[3] );

        printf("First  set of vectors from file %s\n", argv[1] );
        printf("Second set of vectors from file %s\n", argv[2] );
        printf("Will calculate overlap between %d vectors\n", n );

        k1 = 0;
        while( fgets(line, MAXLINELEN, file1) != NULL )
                {
                        i = 0;
                        if ( sscanf( (char *)(strtok( line, " \t\n")), "%f", &eig1[i][k1] ) != 1 )
                                {
                                        printf("Not enough vectors in file %s\n", argv[1] );
                                        exit( 1 );
                                }
                        
                        for ( i=1 ; i < n ; i++ )
                                {
                                        if ( sscanf( (char *)(strtok( NULL, " \t\n")), "%f", &eig1[i][k1] ) != 1 )
                                                {
                                                        printf("Not enough vectors in file %s\n", argv[1] );
                                                        exit( 1 );
                                                }
                                }
                        k1++;
                }

        printf("Read %d vectors, each of length %d from file %s\n", n, k1, argv[1] );

        k2 = 0;
        while( fgets(line, MAXLINELEN, file2) != NULL )
                {
                        i = 0;
                        if ( sscanf( (char *)(strtok( line, " \t\n")), "%f", &eig2[i][k2] ) != 1 )
                                {
                                        printf("Not enough vectors in file %s\n", argv[2] );
                                        exit( 1 );
                                }
                        
                        for ( i=1 ; i < n ; i++ )
                                {
                                        if ( sscanf( (char *)(strtok( NULL, " \t\n")), "%f", &eig2[i][k2] ) != 1 )
                                                {
                                                        printf("Not enough vectors in file %s\n", argv[2] );
                                                        exit( 1 );
                                                }
                                }
                        k2++;
                }

        printf("Read %d vectors, each of length %d from file %s\n", n, k2, argv[2] );

        if ( k1 != k2 )
                {
                        printf("Dimensions of vectors not equal. Abort.\n");
                        exit( 1 );
                }


/* The actual calculation */

        over = 0.0;
        
        for ( i=0 ; i < n  ; i++ )
        for ( j=0 ; j < n  ; j++ )
                {       
                        inter = 0.0;
                        for ( l=0 ; l < k1 ; l++ )
                                inter += (double)(eig1[i][l]) * (double)(eig2[j][l]);

                        over += inter * inter;
                }

        over = over / (double)(n);

        printf("\nOverlap = %8.4f\n", (float)(over) );

}

If you would rather have it in perl :

#!/usr/bin/perl -w

$v1=[];
$v2=[];

# File handling

( scalar( @ARGV) == 3 )  || die "\n\nUsage : eigenoverlap file1 file2 number_of_vectors\n";

open ( file1, $ARGV[0] ) || die "\n\nCan not open file $ARGV[0]\n";
open ( file2, $ARGV[1] ) || die "\n\nCan not open file $ARGV[1]\n";

$n = $ARGV[2];

print "First  set of vectors from file $ARGV[0]\n";
print "Second set of vectors from file $ARGV[1]\n";
print "Will calculate overlap between $n vectors\n";

@eig1 = <file1>;
@eig2 = <file2>;

$k1 = scalar( @eig1 );
$k2 = scalar( @eig2 );

( $k1 == $k2 ) || die "\n\nDimensions of vectors not equal\n";


# Place the data in matrices for easy of use

for ( $i = 0 ; $i < $k1 ; $i++ )
  {
    @words = split( ' ', $eig1[$i]);
    ( scalar( @words ) >= $n ) || die "\n\nNot enough vectors in file $ARGV[0]\n";
    for ( $k = 0 ; $k < $n ; $k++ )
      {
        $$v1[$i][$k] = $words[$k];
      }
  }

for ( $i = 0 ; $i < $k2 ; $i++ )
  {
    @words = split( ' ', $eig2[$i]);
    ( scalar( @words ) >= $n ) || die "\n\nNot enough vectors in file $ARGV[1]\n";
    for ( $k = 0 ; $k < $n ; $k++ )
      {
        $$v2[$i][$k] = $words[$k];
      }
  }


# The actual calculation (re-writing C)

$over = 0.0;
        
        for ( $i=0 ; $i < $n  ; $i++ )
        {
        for ( $j=0 ; $j < $n  ; $j++ )
                {       
                        $inter = 0.0;
                        for ( $l=0 ; $l < $k1 ; $l++ )
                                {
                                  $inter += $$v1[$l][$i] * $$v2[$l][$j];
                                }

                        $over += $inter * $inter;
                }
        }
        
        $over = $over / $n;

        printf "\n\nOverlap = %6.4f\n", $over;


close( file2 );
close( file1 );