From 1e8beff8c738d8555792c52cbbb99c6293483693 Mon Sep 17 00:00:00 2001 From: imitxelena003 Date: Mon, 30 Aug 2021 16:28:38 +0200 Subject: [PATCH 1/5] new method to compute T from C --- mastermsm/msm/msm.py | 5 +++-- mastermsm/msm/msm_lib.py | 25 +++++++++++++++++-------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/mastermsm/msm/msm.py b/mastermsm/msm/msm.py index a8d9bb7..324bbe1 100644 --- a/mastermsm/msm/msm.py +++ b/mastermsm/msm/msm.py @@ -7,6 +7,7 @@ from functools import reduce, cmp_to_key import itertools import numpy as np +from numpy.core.numeric import normalize_axis_tuple import networkx as nx import scipy.linalg as spla import matplotlib.pyplot as plt @@ -353,7 +354,7 @@ def do_count(self, sliding=True): self.count += self.count.transpose() self.keep_states, self.keep_keys = self.check_connect() - def do_trans(self, evecs=False): + def do_trans(self, evecs=False, normalize=False): """ Wrapper for transition matrix calculation. Also calculates its eigenvalues and right eigenvectors. @@ -368,7 +369,7 @@ def do_trans(self, evecs=False): nkeep = len(self.keep_states) keep_states = self.keep_states count = self.count - self.trans = msm_lib.calc_trans(nkeep, keep_states, count) + self.trans = msm_lib.calc_trans(nkeep, keep_states, count, normalize=normalize) if not evecs: self.tauT, self.peqT = self.calc_eigsT() else: diff --git a/mastermsm/msm/msm_lib.py b/mastermsm/msm/msm_lib.py index f5e66c8..61c622a 100644 --- a/mastermsm/msm/msm_lib.py +++ b/mastermsm/msm/msm_lib.py @@ -536,15 +536,15 @@ def do_boots_worker(x): peqT = rvecsT[:,ieqT]/peqT_sum return tauT, peqT, trans, keep_keys -def calc_trans(nkeep=None, keep_states=None, count=None): +def calc_trans(nkeep=None, keep_states=None, count=None, normalize=False): """ Calculates transition matrix. Uses the maximum likelihood expression by Prinz et al.[1]_ Parameters ---------- - lagt : float - Lag time for construction of MSM. + normalize : boolean + Use method of Zimmerman et al. JCTC 2018 Returns ------- @@ -558,11 +558,20 @@ def calc_trans(nkeep=None, keep_states=None, count=None): Generation and validation", J. Chem. Phys. (2011). """ trans = np.zeros([nkeep, nkeep], float) - for i in range(nkeep): - ni = reduce(lambda x, y: x + y, map(lambda x: - count[keep_states[x]][keep_states[i]], range(nkeep))) - for j in range(nkeep): - trans[j][i] = float(count[keep_states[j]][keep_states[i]])/float(ni) + if normalize: + pseudo = 1./float(nkeep) + for i in range(nkeep): + ni = reduce(lambda x, y: x + y, map(lambda x: + count[keep_states[x]][keep_states[i]], range(nkeep))) + for j in range(nkeep): + trans[j][i] = float(count[keep_states[j]][keep_states[i]]) + pseudo + trans[j][i] /= float(ni + pseudo) + else: + for i in range(nkeep): + ni = reduce(lambda x, y: x + y, map(lambda x: + count[keep_states[x]][keep_states[i]], range(nkeep))) + for j in range(nkeep): + trans[j][i] = float(count[keep_states[j]][keep_states[i]])/float(ni) return trans def calc_rate(nkeep, trans, lagt): From d638e5c1e96dc16b295295a9ef53015db2b43539 Mon Sep 17 00:00:00 2001 From: imitxelena003 Date: Mon, 6 Sep 2021 13:18:27 +0200 Subject: [PATCH 2/5] tica, not finished --- mastermsm/trajectory/traj.py | 12 +++- mastermsm/trajectory/traj_lib.py | 97 +++++++++++++++++++++++++++++++- 2 files changed, 107 insertions(+), 2 deletions(-) diff --git a/mastermsm/trajectory/traj.py b/mastermsm/trajectory/traj.py index 97c1f3a..5a69917 100644 --- a/mastermsm/trajectory/traj.py +++ b/mastermsm/trajectory/traj.py @@ -201,7 +201,7 @@ def _read_distraj(self, distraj=None, dt=None): return cstates, 1. def discretize(self, method="rama", states=None, nbins=20,\ - mcs=100, ms=50): + mcs=100, ms=50, lagt=None): """ Discretize the simulation data. Parameters @@ -219,6 +219,8 @@ def discretize(self, method="rama", states=None, nbins=20,\ min_cluster_size for HDBSCAN ms : int min_samples for HDBSCAN + lagt : int + Lag time for TICA Returns ------- @@ -240,6 +242,14 @@ def discretize(self, method="rama", states=None, nbins=20,\ self.distraj = traj_lib.discrete_backbone_torsion(mcs, ms, phi=phi, psi=psi) elif method == "contacts_hdb": self.distraj = traj_lib.discrete_contacts_hdbscan(mcs, ms, self.mdt) + elif method == "tica": + dists = md.compute_contacts(self.mdt, contacts='all', scheme='ca', periodic=True) + x = [] + for dist in dists[0]: + x.append(dist) + x = np.transpose(x) + if lagt == None: lagt = self.dt + self.distraj = traj_lib.tica(x,lagt) def find_keys(self, exclude=['O']): """ Finds out the discrete states in the trajectory diff --git a/mastermsm/trajectory/traj_lib.py b/mastermsm/trajectory/traj_lib.py index 8bec510..1c78a2b 100644 --- a/mastermsm/trajectory/traj_lib.py +++ b/mastermsm/trajectory/traj_lib.py @@ -6,8 +6,10 @@ import copy import sys import math -import hdbscan import numpy as np +from numpy.core.fromnumeric import mean +from scipy import linalg as spla +import hdbscan from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA import mdtraj as md @@ -425,3 +427,96 @@ def _shift(psi, phi): if psi_s[i] > 2: psi_s[i] -= 2*np.pi return psi_s, phi_s + +def tica(x,tau,dim=2): + """ + Calculate TICA components for features trajectory + 'x' with lag time 'tau'. + JCTC Schultze et al. 2021 + + Parameters + ----------- + x : array + Array with features for each frame of the discrete trajectory. + tau : int + Lag time corresponding to the discrete trajectory. + dim : int + Number of TICA dimensions to be computed. + + Returns: + ------- + evals : numpy array + Resulting eigenvalues + evecs : numpy array + Resulting reaction coordinates + + """ + + # IONIX: ORGANIZAR x de modo que x[0] contiene la lista de los + # valores del primer feature para todos los frames, x[1] + # la lista del segundo feature, etc. + print('tau value for TICA:',tau) + + # compute mean free x + x = meanfree(x) + # estimate covariant symmetrized matrices + cmat, cmat0 = covmatsym(x,tau) + # solve generalized eigenvalue problem + n = len(x) + evals, evecs = \ + spla.eigh(cmat,b=cmat0,eigvals_only=False,subset_by_index=[n-dim,n-1]) + + return evals, evecs + +def meanfree(x): + """ + Compute mean free coordinates, i.e. + with zero time average. + + """ + for i,xval in enumerate(x): + x[i] = xval - np.mean(xval) + return x + +def covmatsym(x,tau): + """ + Build symmetrized covariance matrices. + + """ + cmat = np.zeros((len(x),len(x)),float) + for i,xval in enumerate(x): + for j in range(i): + cmat[i][j] = covmat(xval,x[j],tau) + cmat /= float(len(x[0])-tau) + cmat *= 0.5 + + cmat0 = np.zeros((len(x),len(x)),float) + for i,xval in enumerate(x): + for j in range(i): + cmat0[i][j] = covmat0(xval,x[j],tau) + cmat0 /= float(len(x[0])-tau) + cmat0 *= 0.5 + + return cmat, cmat0 + +def covmat(x,y,tau): + """ + Calculate covariance matrices. + + """ + if len(x) != len(y): sys.exit('cannot obtain covariance matrices') + aux = 0.0 + for i,xval in enumerate(x): + aux += xval*y[i+tau] + x[i+tau]*y[i] + if i == (len(x)-tau-1): return aux + +def covmat0(x,y,tau): + """ + Calculate covariance matrices. + + """ + if len(x) != len(y): sys.exit('cannot obtain covariance matrices') + aux = 0.0 + for i,xval in enumerate(x): + aux += xval*y[i] + x[i+tau]*y[i+tau] + if i == (len(x)-tau-1): return aux \ No newline at end of file From f6aea1fe1d56f5d51826037b8347c9edad47a5dc Mon Sep 17 00:00:00 2001 From: imitxelena003 Date: Tue, 7 Sep 2021 15:09:41 +0200 Subject: [PATCH 3/5] tica, not bugs, result wrong yet --- mastermsm/trajectory/traj.py | 29 ++++++++++++++++++++++++----- mastermsm/trajectory/traj_lib.py | 16 +++++++++------- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/mastermsm/trajectory/traj.py b/mastermsm/trajectory/traj.py index 5a69917..5bb8a85 100644 --- a/mastermsm/trajectory/traj.py +++ b/mastermsm/trajectory/traj.py @@ -201,7 +201,7 @@ def _read_distraj(self, distraj=None, dt=None): return cstates, 1. def discretize(self, method="rama", states=None, nbins=20,\ - mcs=100, ms=50, lagt=None): + mcs=100, ms=50): """ Discretize the simulation data. Parameters @@ -219,8 +219,6 @@ def discretize(self, method="rama", states=None, nbins=20,\ min_cluster_size for HDBSCAN ms : int min_samples for HDBSCAN - lagt : int - Lag time for TICA Returns ------- @@ -242,14 +240,35 @@ def discretize(self, method="rama", states=None, nbins=20,\ self.distraj = traj_lib.discrete_backbone_torsion(mcs, ms, phi=phi, psi=psi) elif method == "contacts_hdb": self.distraj = traj_lib.discrete_contacts_hdbscan(mcs, ms, self.mdt) - elif method == "tica": + + def tica(self, method='contacts', lagt=None): + """ TICA dimensionality reduction. + + Parameters + ---------- + method : str + Compute input for TICA + lagt : int + Lag time for TICA + + Returns + ------- + evals : array + Eigenvalues of TICA coordinate transformation + evecs : array + Eigenvectors of TICA coordinate transformation + + """ + if method == "contacts": dists = md.compute_contacts(self.mdt, contacts='all', scheme='ca', periodic=True) x = [] for dist in dists[0]: x.append(dist) x = np.transpose(x) if lagt == None: lagt = self.dt - self.distraj = traj_lib.tica(x,lagt) + evals, evecs = traj_lib.tica_worker(x,lagt) + ###self.distraj = PROJECT trajectory onto TICA vectors? + return evals, evecs def find_keys(self, exclude=['O']): """ Finds out the discrete states in the trajectory diff --git a/mastermsm/trajectory/traj_lib.py b/mastermsm/trajectory/traj_lib.py index 1c78a2b..cdc3de5 100644 --- a/mastermsm/trajectory/traj_lib.py +++ b/mastermsm/trajectory/traj_lib.py @@ -428,7 +428,7 @@ def _shift(psi, phi): psi_s[i] -= 2*np.pi return psi_s, phi_s -def tica(x,tau,dim=2): +def tica_worker(x,tau,dim=2): """ Calculate TICA components for features trajectory 'x' with lag time 'tau'. @@ -452,9 +452,9 @@ def tica(x,tau,dim=2): """ - # IONIX: ORGANIZAR x de modo que x[0] contiene la lista de los - # valores del primer feature para todos los frames, x[1] - # la lista del segundo feature, etc. + # ORGANIZAR x de modo que x[0] contiene la lista de los + # valores del primer feature para todos los frames, x[1] + # la lista del segundo feature, etc. print('tau value for TICA:',tau) # compute mean free x @@ -464,7 +464,8 @@ def tica(x,tau,dim=2): # solve generalized eigenvalue problem n = len(x) evals, evecs = \ - spla.eigh(cmat,b=cmat0,eigvals_only=False,subset_by_index=[n-dim,n-1]) + spla.eig(cmat,b=cmat0) + #spla.eigh(cmat,b=cmat0,eigvals_only=False,subset_by_index=[n-dim,n-1]) return evals, evecs @@ -480,7 +481,8 @@ def meanfree(x): def covmatsym(x,tau): """ - Build symmetrized covariance matrices. + Build symmetrized covariance + matrices (left). """ cmat = np.zeros((len(x),len(x)),float) @@ -501,7 +503,7 @@ def covmatsym(x,tau): def covmat(x,y,tau): """ - Calculate covariance matrices. + Calculate covariance matrices (right). """ if len(x) != len(y): sys.exit('cannot obtain covariance matrices') From 15de5e70a4b084e426af7941c9fa195ec9e83a14 Mon Sep 17 00:00:00 2001 From: imitxelena003 Date: Thu, 30 Sep 2021 11:50:18 +0200 Subject: [PATCH 4/5] solve conflicts --- mastermsm/trajectory/traj_lib.py | 161 ++++++++++++++++--------------- 1 file changed, 85 insertions(+), 76 deletions(-) diff --git a/mastermsm/trajectory/traj_lib.py b/mastermsm/trajectory/traj_lib.py index cdc3de5..a33121c 100644 --- a/mastermsm/trajectory/traj_lib.py +++ b/mastermsm/trajectory/traj_lib.py @@ -6,14 +6,14 @@ import copy import sys import math -import numpy as np -from numpy.core.fromnumeric import mean -from scipy import linalg as spla import hdbscan +import numpy as np from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA import mdtraj as md import matplotlib.pyplot as plt +from numpy.core.fromnumeric import mean +from scipy import linalg as spla def discrete_rama(phi, psi, seq=None, bounds=None, states=['A', 'E', 'L']): """ Assign a set of phi, psi angles to coarse states. @@ -218,10 +218,14 @@ def _stategrid(phi, psi, nbins): ibin = int(0.5*nbins*(phi/math.pi + 1.)) + int(0.5*nbins*(psi/math.pi + 1))*nbins return ibin -def discrete_backbone_torsion(mcs, ms, phi=None, psi=None, pcs=None, dPCA=False): - """ Assign a set of phi, psi angles (or their corresponding - dPCA variables if dPCA=True) to coarse states - by using the HDBSCAN algorithm +def discrete_backbone_torsion(mcs, ms, phi=None, psi=None, \ + pcs=None, dPCA=False): + """ + Discretize backbone torsion angles + + Assign a set of phi, psi angles (or their corresponding + dPCA variables if dPCA=True) to coarse states + by using the HDBSCAN algorithm. Parameters ---------- @@ -238,63 +242,68 @@ def discrete_backbone_torsion(mcs, ms, phi=None, psi=None, pcs=None, dPCA=False) min_samples for HDBSCAN """ - if dPCA is not True: + if dPCA: + X = pcs + else: + # shift and combine dihedrals if len(phi[0]) != len(psi[0]): - sys.exit() - + raise ValueError("Inconsistent dimensions for angles") + ndih = len(phi[0]) - #print(len(psi[1]), ndih, len(phi[1])) - phis, psis = [], [] - for f, y in zip(phi[1],psi[1]): + phi_shift, psi_shift = [], [] + for f, y in zip(phi[1], psi[1]): for n in range(ndih): - phis.append(f[n]) - psis.append(y[n]) + phi_shift.append(f[n]) + psi_shift.append(y[n]) + np.savetxt("phi_psi.dat", np.column_stack((phi_shift, psi_shift))) + psi_shift, phi_shift = _shift(psi_shift, phi_shift) + data = np.column_stack((phi_shift, psi_shift)) + np.savetxt("phi_psi_shifted.dat", data) + X = StandardScaler().fit_transform(data) + + # Set values for clustering parameters + if mcs is None: + mcs = int(np.sqrt(len(X))) + print("Setting minimum cluster size to: %g" % mcs) + if ms is None: + ms = mcs + print("Setting min samples to: %g" % ms) + + hdb = hdbscan.HDBSCAN(min_cluster_size=mcs, min_samples=ms).fit(X) + hdb.condensed_tree_.plot(select_clusters=True) - psiss, phiss = _shift(psis, phis) - data = np.column_stack((phiss,psiss)) - X = StandardScaler().fit_transform(data) - else: - X = pcs + #plt.savefig("alatb-hdbscan-tree.png",dpi=300,transparent=True) - if mcs is None: mcs = 50 - if ms is None: ms = mcs - - while True: - hb = hdbscan.HDBSCAN(min_cluster_size = mcs, min_samples = ms).fit(X)#fit_predict(X) - hb.condensed_tree_.plot(select_clusters=True) - plt.savefig("alatb-hdbscan-tree.png",dpi=300,transparent=True) - - n_micro_clusters = len(set(hb.labels_)) - (1 if -1 in hb.labels_ else 0) - if n_micro_clusters > 0: - print("HDBSCAN mcs value set to %g"%mcs, n_micro_clusters,'clusters.') - break - elif mcs < 400: - mcs += 25 - else: - sys.exit("Cannot found any valid HDBSCAN mcs value") - #n_noise = list(labels).count(-1) - - aaa - ## plot clusters - colors = ['royalblue', 'maroon', 'forestgreen', 'mediumorchid', \ - 'tan', 'deeppink', 'olive', 'goldenrod', 'lightcyan', 'lightgray'] - vectorizer = np.vectorize(lambda x: colors[x % len(colors)]) - fig, ax = plt.subplots(figsize=(7,7)) - assign = hb.labels_ >= 0 - ax.scatter(X[assign,0],X[assign,1], c=hb.labels_[assign]) - ax.set_xlim(-np.pi, np.pi) - ax.set_ylim(-np.pi, np.pi) - plt.savefig('alaTB_hdbscan.png', dpi=300, transparent=True) - - # remove noise from microstate trajectory and apply TBA (Buchete et al. JPCB 2008) - labels = _filter_states(hb.labels_) - - # remove from clusters points with small (<0.1) probability - for i in range(len(labels)): - if hb.probabilities_[i] < 0.1: - labels[i] = -1 - - return labels +# n_micro_clusters = len(set(hb.labels_)) - (1 if -1 in hb.labels_ else 0 +# if n_micro_clusters > 0: +# print("HDBSCAN mcs value set to %g"%mcs, n_micro_clusters,'clusters.') +# break +# elif mcs < 400: +# mcs += 25 +# else: +# sys.exit("Cannot find any valid HDBSCAN mcs value") +# #n_noise = list(labels).count(-1) + +# ## plot clusters +# colors = ['royalblue', 'maroon', 'forestgreen', 'mediumorchid', \ +# 'tan', 'deeppink', 'olive', 'goldenrod', 'lightcyan', 'lightgray'] +# vectorizer = np.vectorize(lambda x: colors[x % len(colors)]) +# fig, ax = plt.subplots(figsize=(7,7)) +# assign = hb.labels_ >= 0 +# ax.scatter(X[assign,0],X[assign,1], c=hb.labels_[assign]) +# ax.set_xlim(-np.pi, np.pi) +# ax.set_ylim(-np.pi, np.pi) +# plt.savefig('alaTB_hdbscan.png', dpi=300, transparent=True) +# +# # remove noise from microstate trajectory and apply TBA (Buchete et al. JPCB 2008) +# labels = _filter_states(hb.labels_) +# +# # remove from clusters points with small (<0.1) probability +# for i in range(len(labels)): +# if hb.probabilities_[i] < 0.1: +# labels[i] = -1 + + return hdb.labels_ def dPCA(angles): """ @@ -419,20 +428,20 @@ def _filter_states(states): return fs def _shift(psi, phi): - psi_s, phi_s = copy.deepcopy(psi), copy.deepcopy(phi) + psi_s, phi_s = copy.deepcopy(phi), copy.deepcopy(psi) for i in range(len(phi_s)): if phi_s[i] < -2: phi_s[i] += 2*np.pi for i in range(len(psi_s)): if psi_s[i] > 2: psi_s[i] -= 2*np.pi - return psi_s, phi_s + return phi_s, psi_s def tica_worker(x,tau,dim=2): - """ + """ Calculate TICA components for features trajectory 'x' with lag time 'tau'. - JCTC Schultze et al. 2021 + Schultze et al. JCTC 2021 Parameters ----------- @@ -452,25 +461,25 @@ def tica_worker(x,tau,dim=2): """ - # ORGANIZAR x de modo que x[0] contiene la lista de los - # valores del primer feature para todos los frames, x[1] + # x[0] contiene la lista de los valores del + # primer feature para todos los frames, x[1] # la lista del segundo feature, etc. - print('tau value for TICA:',tau) - + print('Lag time for TICA:',tau) + # compute mean free x x = meanfree(x) # estimate covariant symmetrized matrices cmat, cmat0 = covmatsym(x,tau) # solve generalized eigenvalue problem - n = len(x) + #n = len(x) evals, evecs = \ - spla.eig(cmat,b=cmat0) + spla.eig(cmat,b=cmat0,left=True,right=False) #spla.eigh(cmat,b=cmat0,eigvals_only=False,subset_by_index=[n-dim,n-1]) return evals, evecs def meanfree(x): - """ + """ Compute mean free coordinates, i.e. with zero time average. @@ -480,9 +489,9 @@ def meanfree(x): return x def covmatsym(x,tau): - """ + """ Build symmetrized covariance - matrices (left). + matrices. """ cmat = np.zeros((len(x),len(x)),float) @@ -502,7 +511,7 @@ def covmatsym(x,tau): return cmat, cmat0 def covmat(x,y,tau): - """ + """ Calculate covariance matrices (right). """ @@ -513,12 +522,12 @@ def covmat(x,y,tau): if i == (len(x)-tau-1): return aux def covmat0(x,y,tau): - """ - Calculate covariance matrices. + """ + Calculate covariance matrices (left). """ if len(x) != len(y): sys.exit('cannot obtain covariance matrices') aux = 0.0 for i,xval in enumerate(x): aux += xval*y[i] + x[i+tau]*y[i+tau] - if i == (len(x)-tau-1): return aux \ No newline at end of file + if i == (len(x)-tau-1): return aux From d2e8d12634533d8783e551e48d726bc1d326ba24 Mon Sep 17 00:00:00 2001 From: imitxelena003 Date: Mon, 28 Mar 2022 10:52:59 +0200 Subject: [PATCH 5/5] see changes marked with ionix --- mastermsm/msm/msm.py | 2 ++ mastermsm/msm/msm_lib.py | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/mastermsm/msm/msm.py b/mastermsm/msm/msm.py index e841c6b..74ffd3c 100644 --- a/mastermsm/msm/msm.py +++ b/mastermsm/msm/msm.py @@ -366,6 +366,8 @@ def do_trans(self, evecs=False, normalize=False): nkeep = len(self.keep_states) keep_states = self.keep_states count = self.count + ###print('ionix keep_states:',keep_states) + ###print('ionix count matrix:',count) self.trans = msm_lib.calc_trans(nkeep, keep_states, count, normalize=normalize) if not evecs: self.tauT, self.peqT = self.calc_eigsT() diff --git a/mastermsm/msm/msm_lib.py b/mastermsm/msm/msm_lib.py index 61c622a..928e171 100644 --- a/mastermsm/msm/msm_lib.py +++ b/mastermsm/msm/msm_lib.py @@ -2,6 +2,7 @@ This file is part of the MasterMSM package. """ +import sys #ionix import copy import numpy as np import networkx as nx @@ -569,7 +570,10 @@ def calc_trans(nkeep=None, keep_states=None, count=None, normalize=False): else: for i in range(nkeep): ni = reduce(lambda x, y: x + y, map(lambda x: - count[keep_states[x]][keep_states[i]], range(nkeep))) + count[x][i], range(nkeep))) + #ionix count[keep_states[x]][keep_states[i]], range(nkeep))) + if ni==0.0: print(count) #ionix + sys.stdout.flush() for j in range(nkeep): trans[j][i] = float(count[keep_states[j]][keep_states[i]])/float(ni) return trans