# coding: utf-8
# Copyright (c) Pymatgen Development Team.
# Distributed under the terms of the MIT License.
"""
This module provides classes to perform fitting of molecule with arbitrary
atom orders.
This module is supposed to perform exact comparisons without the atom order
correspondence prerequisite, while molecule_structure_comparator is supposed
to do rough comparisons with the atom order correspondence prerequisite.
The implementation is based on an excellent python package called `rmsd` that
you can find at https://github.com/charnley/rmsd.
"""
__author__ = "Xiaohui Qu, Adam Fekete"
__version__ = "1.0"
__email__ = "xhqu1981@gmail.com"
import abc
import copy
import itertools
import logging
import math
import re
import numpy as np
from monty.dev import requires
from monty.json import MSONable
try:
from openbabel import openbabel as ob
from pymatgen.io.babel import BabelMolAdaptor
except ImportError:
ob = None
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import cdist
from pymatgen.core.structure import Molecule # pylint: disable=ungrouped-imports
logger = logging.getLogger(__name__)
[docs]class AbstractMolAtomMapper(MSONable, metaclass=abc.ABCMeta):
"""
Abstract molecular atom order mapping class. A mapping will be able to
find the uniform atom order of two molecules that can pair the
geometrically equivalent atoms.
"""
[docs] @abc.abstractmethod
def get_molecule_hash(self, mol):
"""
Defines a hash for molecules. This allows molecules to be grouped
efficiently for comparison.
Args:
mol: The molecule. OpenBabel OBMol or pymatgen Molecule object
Returns:
A hashable object. Examples can be string formulas, etc.
"""
pass
[docs] @classmethod
def from_dict(cls, d):
"""
Args:
d (): Dict
Returns:
AbstractMolAtomMapper
"""
for trans_modules in ["molecule_matcher"]:
import sys
if sys.version_info > (3, 0):
level = 0 # Python 3.x
else:
level = -1 # Python 2.x
mod = __import__(
"pymatgen.analysis." + trans_modules,
globals(),
locals(),
[d["@class"]],
level,
)
if hasattr(mod, d["@class"]):
class_proxy = getattr(mod, d["@class"])
from_dict_proxy = getattr(class_proxy, "from_dict")
return from_dict_proxy(d)
raise ValueError("Invalid Comparator dict")
[docs]class IsomorphismMolAtomMapper(AbstractMolAtomMapper):
"""
Pair atoms by isomorphism permutations in the OpenBabel::OBAlign class
"""
[docs] def get_molecule_hash(self, mol):
"""
Return inchi as molecular hash
"""
obconv = ob.OBConversion()
obconv.SetOutFormat(str("inchi"))
obconv.AddOption(str("X"), ob.OBConversion.OUTOPTIONS, str("DoNotAddH"))
inchi_text = obconv.WriteString(mol)
match = re.search(r"InChI=(?P<inchi>.+)\n", inchi_text)
return match.group("inchi")
[docs] def as_dict(self):
"""
Returns:
Jsonable dict.
"""
return {
"version": __version__,
"@module": self.__class__.__module__,
"@class": self.__class__.__name__,
}
[docs] @classmethod
def from_dict(cls, d):
"""
Args:
d (dict): Dict representation
Returns:
IsomorphismMolAtomMapper
"""
return IsomorphismMolAtomMapper()
[docs]class InchiMolAtomMapper(AbstractMolAtomMapper):
"""
Pair atoms by inchi labels.
"""
def __init__(self, angle_tolerance=10.0):
"""
Args:
angle_tolerance (float): Angle threshold to assume linear molecule. In degrees.
"""
self._angle_tolerance = angle_tolerance
self._assistant_mapper = IsomorphismMolAtomMapper()
[docs] def as_dict(self):
"""
Returns:
MSONAble dict.
"""
return {
"version": __version__,
"@module": self.__class__.__module__,
"@class": self.__class__.__name__,
"angle_tolerance": self._angle_tolerance,
}
[docs] @classmethod
def from_dict(cls, d):
"""
Args:
d (dict): Dict Representation
Returns:
InchiMolAtomMapper
"""
return InchiMolAtomMapper(angle_tolerance=d["angle_tolerance"])
@staticmethod
def _inchi_labels(mol):
"""
Get the inchi canonical labels of the heavy atoms in the molecule
Args:
mol: The molecule. OpenBabel OBMol object
Returns:
The label mappings. List of tuple of canonical label,
original label
List of equivalent atoms.
"""
obconv = ob.OBConversion()
obconv.SetOutFormat(str("inchi"))
obconv.AddOption(str("a"), ob.OBConversion.OUTOPTIONS)
obconv.AddOption(str("X"), ob.OBConversion.OUTOPTIONS, str("DoNotAddH"))
inchi_text = obconv.WriteString(mol)
match = re.search(
r"InChI=(?P<inchi>.+)\nAuxInfo=.+" r"/N:(?P<labels>[0-9,;]+)/(E:(?P<eq_atoms>[0-9," r";\(\)]*)/)?",
inchi_text,
)
inchi = match.group("inchi")
label_text = match.group("labels")
eq_atom_text = match.group("eq_atoms")
heavy_atom_labels = tuple(int(i) for i in label_text.replace(";", ",").split(","))
eq_atoms = []
if eq_atom_text is not None:
eq_tokens = re.findall(r"\(((?:[0-9]+,)+[0-9]+)\)", eq_atom_text.replace(";", ","))
eq_atoms = tuple(tuple(int(i) for i in t.split(",")) for t in eq_tokens)
return heavy_atom_labels, eq_atoms, inchi
@staticmethod
def _group_centroid(mol, ilabels, group_atoms):
"""
Calculate the centroids of a group atoms indexed by the labels of inchi
Args:
mol: The molecule. OpenBabel OBMol object
ilabel: inchi label map
Returns:
Centroid. Tuple (x, y, z)
"""
c1x, c1y, c1z = 0.0, 0.0, 0.0
for i in group_atoms:
orig_idx = ilabels[i - 1]
oa1 = mol.GetAtom(orig_idx)
c1x += float(oa1.x())
c1y += float(oa1.y())
c1z += float(oa1.z())
num_atoms = len(group_atoms)
c1x /= num_atoms
c1y /= num_atoms
c1z /= num_atoms
return c1x, c1y, c1z
def _virtual_molecule(self, mol, ilabels, eq_atoms):
"""
Create a virtual molecule by unique atoms, the centriods of the
equivalent atoms
Args:
mol: The molecule. OpenBabel OBMol object
ilables: inchi label map
eq_atoms: equivalent atom labels
farthest_group_idx: The equivalent atom group index in which
there is the farthest atom to the centroid
Return:
The virtual molecule
"""
vmol = ob.OBMol()
non_unique_atoms = {a for g in eq_atoms for a in g}
all_atoms = set(range(1, len(ilabels) + 1))
unique_atom_labels = sorted(all_atoms - non_unique_atoms)
# try to align molecules using unique atoms
for i in unique_atom_labels:
orig_idx = ilabels[i - 1]
oa1 = mol.GetAtom(orig_idx)
a1 = vmol.NewAtom()
a1.SetAtomicNum(oa1.GetAtomicNum())
a1.SetVector(oa1.GetVector())
# try to align using centroids of the equivalent atoms
if vmol.NumAtoms() < 3:
for symm in eq_atoms:
c1x, c1y, c1z = self._group_centroid(mol, ilabels, symm)
min_distance = float("inf")
for i in range(1, vmol.NumAtoms() + 1):
va = vmol.GetAtom(i)
distance = math.sqrt((c1x - va.x()) ** 2 + (c1y - va.y()) ** 2 + (c1z - va.z()) ** 2)
if distance < min_distance:
min_distance = distance
if min_distance > 0.2:
a1 = vmol.NewAtom()
a1.SetAtomicNum(9)
a1.SetVector(c1x, c1y, c1z)
return vmol
@staticmethod
def _align_heavy_atoms(mol1, mol2, vmol1, vmol2, ilabel1, ilabel2, eq_atoms):
"""
Align the label of topologically identical atoms of second molecule
towards first molecule
Args:
mol1: First molecule. OpenBabel OBMol object
mol2: Second molecule. OpenBabel OBMol object
vmol1: First virtual molecule constructed by centroids. OpenBabel
OBMol object
vmol2: First virtual molecule constructed by centroids. OpenBabel
OBMol object
ilabel1: inchi label map of the first molecule
ilabel2: inchi label map of the second molecule
eq_atoms: equivalent atom lables
Return:
corrected inchi labels of heavy atoms of the second molecule
"""
nvirtual = vmol1.NumAtoms()
nheavy = len(ilabel1)
for i in ilabel2: # add all heavy atoms
a1 = vmol1.NewAtom()
a1.SetAtomicNum(1)
a1.SetVector(0.0, 0.0, 0.0) # useless, just to pair with vmol2
oa2 = mol2.GetAtom(i)
a2 = vmol2.NewAtom()
a2.SetAtomicNum(1)
# align using the virtual atoms, these atoms are not
# used to align, but match by positions
a2.SetVector(oa2.GetVector())
aligner = ob.OBAlign(False, False)
aligner.SetRefMol(vmol1)
aligner.SetTargetMol(vmol2)
aligner.Align()
aligner.UpdateCoords(vmol2)
canon_mol1 = ob.OBMol()
for i in ilabel1:
oa1 = mol1.GetAtom(i)
a1 = canon_mol1.NewAtom()
a1.SetAtomicNum(oa1.GetAtomicNum())
a1.SetVector(oa1.GetVector())
aligned_mol2 = ob.OBMol()
for i in range(nvirtual + 1, nvirtual + nheavy + 1):
oa2 = vmol2.GetAtom(i)
a2 = aligned_mol2.NewAtom()
a2.SetAtomicNum(oa2.GetAtomicNum())
a2.SetVector(oa2.GetVector())
canon_label2 = list(range(1, nheavy + 1))
for symm in eq_atoms:
for i in symm:
canon_label2[i - 1] = -1
for symm in eq_atoms:
candidates1 = list(symm)
candidates2 = list(symm)
for c2 in candidates2:
distance = 99999.0
canon_idx = candidates1[0]
a2 = aligned_mol2.GetAtom(c2)
for c1 in candidates1:
a1 = canon_mol1.GetAtom(c1)
d = a1.GetDistance(a2)
if d < distance:
distance = d
canon_idx = c1
canon_label2[c2 - 1] = canon_idx
candidates1.remove(canon_idx)
canon_inchi_orig_map2 = list(zip(canon_label2, list(range(1, nheavy + 1)), ilabel2))
canon_inchi_orig_map2.sort(key=lambda m: m[0])
heavy_atom_indices2 = tuple(x[2] for x in canon_inchi_orig_map2)
return heavy_atom_indices2
@staticmethod
def _align_hydrogen_atoms(mol1, mol2, heavy_indices1, heavy_indices2):
"""
Align the label of topologically identical atoms of second molecule
towards first molecule
Args:
mol1: First molecule. OpenBabel OBMol object
mol2: Second molecule. OpenBabel OBMol object
heavy_indices1: inchi label map of the first molecule
heavy_indices2: label map of the second molecule
Return:
corrected label map of all atoms of the second molecule
"""
num_atoms = mol2.NumAtoms()
all_atom = set(range(1, num_atoms + 1))
hydrogen_atoms1 = all_atom - set(heavy_indices1)
hydrogen_atoms2 = all_atom - set(heavy_indices2)
label1 = heavy_indices1 + tuple(hydrogen_atoms1)
label2 = heavy_indices2 + tuple(hydrogen_atoms2)
cmol1 = ob.OBMol()
for i in label1:
oa1 = mol1.GetAtom(i)
a1 = cmol1.NewAtom()
a1.SetAtomicNum(oa1.GetAtomicNum())
a1.SetVector(oa1.GetVector())
cmol2 = ob.OBMol()
for i in label2:
oa2 = mol2.GetAtom(i)
a2 = cmol2.NewAtom()
a2.SetAtomicNum(oa2.GetAtomicNum())
a2.SetVector(oa2.GetVector())
aligner = ob.OBAlign(False, False)
aligner.SetRefMol(cmol1)
aligner.SetTargetMol(cmol2)
aligner.Align()
aligner.UpdateCoords(cmol2)
hydrogen_label2 = []
hydrogen_label1 = list(range(len(heavy_indices1) + 1, num_atoms + 1))
for h2 in range(len(heavy_indices2) + 1, num_atoms + 1):
distance = 99999.0
idx = hydrogen_label1[0]
a2 = cmol2.GetAtom(h2)
for h1 in hydrogen_label1:
a1 = cmol1.GetAtom(h1)
d = a1.GetDistance(a2)
if d < distance:
distance = d
idx = h1
hydrogen_label2.append(idx)
hydrogen_label1.remove(idx)
hydrogen_orig_idx2 = label2[len(heavy_indices2) :]
hydrogen_canon_orig_map2 = list(zip(hydrogen_label2, hydrogen_orig_idx2))
hydrogen_canon_orig_map2.sort(key=lambda m: m[0])
hydrogen_canon_indices2 = [x[1] for x in hydrogen_canon_orig_map2]
canon_label1 = label1
canon_label2 = heavy_indices2 + tuple(hydrogen_canon_indices2)
return canon_label1, canon_label2
@staticmethod
def _get_elements(mol, label):
"""
The the elements of the atoms in the specified order
Args:
mol: The molecule. OpenBabel OBMol object.
label: The atom indices. List of integers.
Returns:
Elements. List of integers.
"""
elements = [int(mol.GetAtom(i).GetAtomicNum()) for i in label]
return elements
def _is_molecule_linear(self, mol):
"""
Is the molecule a linear one
Args:
mol: The molecule. OpenBabel OBMol object.
Returns:
Boolean value.
"""
if mol.NumAtoms() < 3:
return True
a1 = mol.GetAtom(1)
a2 = mol.GetAtom(2)
for i in range(3, mol.NumAtoms() + 1):
angle = float(mol.GetAtom(i).GetAngle(a2, a1))
if angle < 0.0:
angle = -angle
if angle > 90.0:
angle = 180.0 - angle
if angle > self._angle_tolerance:
return False
return True
[docs] def get_molecule_hash(self, mol):
"""
Return inchi as molecular hash
"""
obmol = BabelMolAdaptor(mol).openbabel_mol
inchi = self._inchi_labels(obmol)[2]
return inchi
[docs]class MoleculeMatcher(MSONable):
"""
Class to match molecules and identify whether molecules are the same.
"""
@requires(
ob,
"BabelMolAdaptor requires openbabel to be installed with "
"Python bindings. Please get it at http://openbabel.org "
"(version >=3.0.0).",
)
def __init__(self, tolerance=0.01, mapper=InchiMolAtomMapper()):
"""
Args:
tolerance (float): RMSD difference threshold whether two molecules are
different
mapper (AbstractMolAtomMapper): MolAtomMapper object that is able to map the atoms of two
molecule to uniform order
"""
self._tolerance = tolerance
self._mapper = mapper
[docs] def fit(self, mol1, mol2):
"""
Fit two molecules.
Args:
mol1: First molecule. OpenBabel OBMol or pymatgen Molecule object
mol2: Second molecule. OpenBabel OBMol or pymatgen Molecule object
Returns:
A boolean value indicates whether two molecules are the same.
"""
return self.get_rmsd(mol1, mol2) < self._tolerance
[docs] def get_rmsd(self, mol1, mol2):
"""
Get RMSD between two molecule with arbitrary atom order.
Returns:
RMSD if topology of the two molecules are the same
Infinite if the topology is different
"""
label1, label2 = self._mapper.uniform_labels(mol1, mol2)
if label1 is None or label2 is None:
return float("Inf")
return self._calc_rms(mol1, mol2, label1, label2)
@staticmethod
def _calc_rms(mol1, mol2, clabel1, clabel2):
"""
Calculate the RMSD.
Args:
mol1: The first molecule. OpenBabel OBMol or pymatgen Molecule
object
mol2: The second molecule. OpenBabel OBMol or pymatgen Molecule
object
clabel1: The atom indices that can reorder the first molecule to
uniform atom order
clabel1: The atom indices that can reorder the second molecule to
uniform atom order
Returns:
The RMSD.
"""
obmol1 = BabelMolAdaptor(mol1).openbabel_mol
obmol2 = BabelMolAdaptor(mol2).openbabel_mol
cmol1 = ob.OBMol()
for i in clabel1:
oa1 = obmol1.GetAtom(i)
a1 = cmol1.NewAtom()
a1.SetAtomicNum(oa1.GetAtomicNum())
a1.SetVector(oa1.GetVector())
cmol2 = ob.OBMol()
for i in clabel2:
oa2 = obmol2.GetAtom(i)
a2 = cmol2.NewAtom()
a2.SetAtomicNum(oa2.GetAtomicNum())
a2.SetVector(oa2.GetVector())
aligner = ob.OBAlign(True, False)
aligner.SetRefMol(cmol1)
aligner.SetTargetMol(cmol2)
aligner.Align()
return aligner.GetRMSD()
[docs] def group_molecules(self, mol_list):
"""
Group molecules by structural equality.
Args:
mol_list: List of OpenBabel OBMol or pymatgen objects
Returns:
A list of lists of matched molecules
Assumption: if s1=s2 and s2=s3, then s1=s3
This may not be true for small tolerances.
"""
mol_hash = [(i, self._mapper.get_molecule_hash(m)) for i, m in enumerate(mol_list)]
mol_hash.sort(key=lambda x: x[1])
# Use molecular hash to pre-group molecules.
raw_groups = tuple(tuple(m[0] for m in g) for k, g in itertools.groupby(mol_hash, key=lambda x: x[1]))
group_indices = []
for rg in raw_groups:
mol_eq_test = [
(p[0], p[1], self.fit(mol_list[p[0]], mol_list[p[1]])) for p in itertools.combinations(sorted(rg), 2)
]
mol_eq = {(p[0], p[1]) for p in mol_eq_test if p[2]}
not_alone_mols = set(itertools.chain.from_iterable(mol_eq))
alone_mols = set(rg) - not_alone_mols
group_indices.extend([[m] for m in alone_mols])
while len(not_alone_mols) > 0:
current_group = {not_alone_mols.pop()}
while len(not_alone_mols) > 0:
candidate_pairs = {tuple(sorted(p)) for p in itertools.product(current_group, not_alone_mols)}
mutual_pairs = candidate_pairs & mol_eq
if len(mutual_pairs) == 0:
break
mutual_mols = set(itertools.chain.from_iterable(mutual_pairs))
current_group |= mutual_mols
not_alone_mols -= mutual_mols
group_indices.append(sorted(current_group))
group_indices.sort(key=lambda x: (len(x), -x[0]), reverse=True)
all_groups = [[mol_list[i] for i in g] for g in group_indices]
return all_groups
[docs] def as_dict(self):
"""
Returns:
MSONAble dict.
"""
return {
"version": __version__,
"@module": self.__class__.__module__,
"@class": self.__class__.__name__,
"tolerance": self._tolerance,
"mapper": self._mapper.as_dict(),
}
[docs] @classmethod
def from_dict(cls, d):
"""
Args:
d (dict): Dict representation
Returns:
MoleculeMatcher
"""
return MoleculeMatcher(
tolerance=d["tolerance"],
mapper=AbstractMolAtomMapper.from_dict(d["mapper"]),
)
[docs]class KabschMatcher(MSONable):
"""Molecule matcher using Kabsch algorithm
The Kabsch algorithm capable aligning two molecules by finding the parameters
(translation, rotation) which minimize the root-mean-square-deviation (RMSD) of
two molecules which are topologically (atom types, geometry) similar two each other.
Notes:
When aligning molecules, the atoms of the two molecules **must** be in the same
order for the results to be sensible.
"""
def __init__(self, target: Molecule):
"""Constructor of the matcher object.
Args:
target: a `Molecule` object used as a target during the alignment
"""
self.target = target
[docs] def match(self, p: Molecule):
"""Using the Kabsch algorithm the alignment of two molecules (P, Q)
happens in three steps:
- translate the P and Q into their centroid
- compute of the optimal rotation matrix (U) using Kabsch algorithm
- compute the translation (V) and rmsd
The function returns the rotation matrix (U), translation vector (V),
and RMSD between Q and P', where P' is:
P' = P * U + V
Args:
p: a `Molecule` object what will be matched with the target one.
Returns:
U: Rotation matrix (D,D)
V: Translation vector (D)
RMSD : Root mean squared deviation between P and Q
"""
if self.target.atomic_numbers != p.atomic_numbers:
raise ValueError("The order of the species aren't matching! " "Please try using `PermInvMatcher`.")
p_coord, q_coord = p.cart_coords, self.target.cart_coords
# Both sets of coordinates must be translated first, so that their
# centroid coincides with the origin of the coordinate system.
p_trans, q_trans = p_coord.mean(axis=0), q_coord.mean(axis=0)
p_centroid, q_centroid = p_coord - p_trans, q_coord - q_trans
# The optimal rotation matrix U using Kabsch algorithm
U = self.kabsch(p_centroid, q_centroid)
p_prime_centroid = np.dot(p_centroid, U)
rmsd = np.sqrt(np.mean(np.square(p_prime_centroid - q_centroid)))
V = q_trans - np.dot(p_trans, U)
return U, V, rmsd
[docs] def fit(self, p: Molecule):
"""Rotate and transform `p` molecule according to the best match.
Args:
p: a `Molecule` object what will be matched with the target one.
Returns:
p_prime: Rotated and translated of the `p` `Molecule` object
rmsd: Root-mean-square-deviation between `p_prime` and the `target`
"""
U, V, rmsd = self.match(p)
# Rotate and translate matrix `p` onto the target molecule.
# P' = P * U + V
p_prime = p.copy()
for site in p_prime:
site.coords = np.dot(site.coords, U) + V
return p_prime, rmsd
[docs] @staticmethod
def kabsch(P: np.ndarray, Q: np.ndarray):
"""The Kabsch algorithm is a method for calculating the optimal rotation matrix
that minimizes the root mean squared deviation (RMSD) between two paired sets of points
P and Q, centered around the their centroid.
For more info see:
- http://en.wikipedia.org/wiki/Kabsch_algorithm and
- https://cnx.org/contents/HV-RsdwL@23/Molecular-Distance-Measures
Args:
P: Nx3 matrix, where N is the number of points.
Q: Nx3 matrix, where N is the number of points.
Returns:
U: 3x3 rotation matrix
"""
# Computation of the cross-covariance matrix
C = np.dot(P.T, Q)
# Computation of the optimal rotation matrix
# using singular value decomposition (SVD).
V, S, WT = np.linalg.svd(C)
# Getting the sign of the det(V*Wt) to decide whether
d = np.linalg.det(np.dot(V, WT))
# And finally calculating the optimal rotation matrix R
# we need to correct our rotation matrix to ensure a right-handed coordinate system.
U = np.dot(np.dot(V, np.diag([1, 1, d])), WT)
return U
[docs]class BruteForceOrderMatcher(KabschMatcher):
"""Finding the best match between molecules by selecting molecule order
with the smallest RMSD from all the possible order combinations.
Notes:
When aligning molecules, the atoms of the two molecules **must** have same number
of atoms from the same species.
"""
[docs] def match(self, p: Molecule, ignore_warning=False):
"""Similar as `KabschMatcher.match` but this method also finds the order of
atoms which belongs to the best match.
A `ValueError` will be raised when the total number of possible combinations
become unfeasible (more than a million combination).
Args:
p: a `Molecule` object what will be matched with the target one.
ignore_warning: ignoring error when the number of combination is too large
Returns:
inds: The indices of atoms
U: 3x3 rotation matrix
V: Translation vector
rmsd: Root mean squared deviation between P and Q
"""
q = self.target
if sorted(p.atomic_numbers) != sorted(q.atomic_numbers):
raise ValueError("The number of the same species aren't matching!")
_, count = np.unique(p.atomic_numbers, return_counts=True)
total_permutations = 1
for c in count:
total_permutations *= np.math.factorial(c) # type: ignore
if not ignore_warning and total_permutations > 1_000_000:
raise ValueError(
"The number of all possible permutations "
"({}) is not feasible to run this method!".format(total_permutations)
)
p_coord, q_coord = p.cart_coords, q.cart_coords
p_atoms, q_atoms = np.array(p.atomic_numbers), np.array(q.atomic_numbers)
# Both sets of coordinates must be translated first, so that
# their centroid coincides with the origin of the coordinate system.
p_trans, q_trans = p_coord.mean(axis=0), q_coord.mean(axis=0)
p_centroid, q_centroid = p_coord - p_trans, q_coord - q_trans
# Sort the order of the target molecule by the elements
q_inds = np.argsort(q_atoms)
q_centroid = q_centroid[q_inds]
# Initializing return values
rmsd = np.inf
# Generate all permutation grouped/sorted by the elements
for p_inds_test in self.permutations(p_atoms):
p_centroid_test = p_centroid[p_inds_test]
U_test = self.kabsch(p_centroid_test, q_centroid)
p_centroid_prime_test = np.dot(p_centroid_test, U_test)
rmsd_test = np.sqrt(np.mean(np.square(p_centroid_prime_test - q_centroid)))
if rmsd_test < rmsd:
p_inds, U, rmsd = p_inds_test, U_test, rmsd_test
# Rotate and translate matrix P unto matrix Q using Kabsch algorithm.
# P' = P * U + V
V = q_trans - np.dot(p_trans, U)
# Using the original order of the indices
inds = p_inds[np.argsort(q_inds)]
return inds, U, V, rmsd
[docs] def fit(self, p: Molecule, ignore_warning=False):
"""Order, rotate and transform `p` molecule according to the best match.
A `ValueError` will be raised when the total number of possible combinations
become unfeasible (more than a million combinations).
Args:
p: a `Molecule` object what will be matched with the target one.
ignore_warning: ignoring error when the number of combination is too large
Returns:
p_prime: Rotated and translated of the `p` `Molecule` object
rmsd: Root-mean-square-deviation between `p_prime` and the `target`
"""
inds, U, V, rmsd = self.match(p, ignore_warning=ignore_warning)
p_prime = Molecule.from_sites([p[i] for i in inds])
for site in p_prime:
site.coords = np.dot(site.coords, U) + V
return p_prime, rmsd
[docs] @staticmethod
def permutations(atoms):
"""Generates all the possible permutations of atom order. To achieve better
performance all tha cases where the atoms are different has been ignored.
"""
element_iterators = [itertools.permutations(np.where(atoms == element)[0]) for element in np.unique(atoms)]
for inds in itertools.product(*element_iterators):
yield np.array(list(itertools.chain(*inds)))
[docs]class HungarianOrderMatcher(KabschMatcher):
"""This method pre-aligns the molecules based on their principal inertia
axis and then re-orders the input atom list using the Hungarian method.
Notes:
This method cannot guarantee the best match but is very fast.
When aligning molecules, the atoms of the two molecules **must** have same number
of atoms from the same species.
"""
[docs] def match(self, p: Molecule):
"""Similar as `KabschMatcher.match` but this method also finds the order of
atoms which belongs to the best match.
Args:
p: a `Molecule` object what will be matched with the target one.
Returns:
inds: The indices of atoms
U: 3x3 rotation matrix
V: Translation vector
rmsd: Root mean squared deviation between P and Q
"""
if sorted(p.atomic_numbers) != sorted(self.target.atomic_numbers):
raise ValueError("The number of the same species aren't matching!")
p_coord, q_coord = p.cart_coords, self.target.cart_coords
p_atoms, q_atoms = (
np.array(p.atomic_numbers),
np.array(self.target.atomic_numbers),
)
p_weights = np.array([site.species.weight for site in p])
q_weights = np.array([site.species.weight for site in self.target])
# Both sets of coordinates must be translated first, so that
# their center of mass with the origin of the coordinate system.
p_trans, q_trans = p.center_of_mass, self.target.center_of_mass
p_centroid, q_centroid = p_coord - p_trans, q_coord - q_trans
# Initializing return values
rmsd = np.inf
# Generate all permutation grouped/sorted by the elements
for p_inds_test in self.permutations(p_atoms, p_centroid, p_weights, q_atoms, q_centroid, q_weights):
p_centroid_test = p_centroid[p_inds_test]
U_test = self.kabsch(p_centroid_test, q_centroid)
p_centroid_prime_test = np.dot(p_centroid_test, U_test)
rmsd_test = np.sqrt(np.mean(np.square(p_centroid_prime_test - q_centroid)))
if rmsd_test < rmsd:
inds, U, rmsd = p_inds_test, U_test, rmsd_test
# Rotate and translate matrix P unto matrix Q using Kabsch algorithm.
# P' = P * U + V
V = q_trans - np.dot(p_trans, U)
return inds, U, V, rmsd
[docs] def fit(self, p: Molecule):
"""Order, rotate and transform `p` molecule according to the best match.
Args:
p: a `Molecule` object what will be matched with the target one.
Returns:
p_prime: Rotated and translated of the `p` `Molecule` object
rmsd: Root-mean-square-deviation between `p_prime` and the `target`
"""
inds, U, V, rmsd = self.match(p)
# Translate and rotate `mol1` unto `mol2` using Kabsch algorithm.
p_prime = Molecule.from_sites([p[i] for i in inds])
for site in p_prime:
site.coords = np.dot(site.coords, U) + V
return p_prime, rmsd
[docs] @staticmethod
def permutations(p_atoms, p_centroid, p_weights, q_atoms, q_centroid, q_weights):
"""Generates two possible permutations of atom order. This method uses the principle component
of the inertia tensor to prealign the molecules and hungarian method to determine the order.
There are always two possible permutation depending on the way to pre-aligning the molecules.
Args:
p_atoms: atom numbers
p_centroid: array of atom positions
p_weights: array of atom weights
q_atoms: atom numbers
q_centroid: array of atom positions
q_weights: array of atom weights
Yield:
perm_inds: array of atoms' order
"""
# get the principal axis of P and Q
p_axis = HungarianOrderMatcher.get_principal_axis(p_centroid, p_weights)
q_axis = HungarianOrderMatcher.get_principal_axis(q_centroid, q_weights)
# rotate Q onto P considering that the axis are parallel and antiparallel
U = HungarianOrderMatcher.rotation_matrix_vectors(q_axis, p_axis)
p_centroid_test = np.dot(p_centroid, U)
# generate full view from q shape to fill in atom view on the fly
perm_inds = np.zeros(len(p_atoms), dtype=int)
# Find unique atoms
species = np.unique(p_atoms)
for specie in species:
p_atom_inds = np.where(p_atoms == specie)[0]
q_atom_inds = np.where(q_atoms == specie)[0]
A = q_centroid[q_atom_inds]
B = p_centroid_test[p_atom_inds]
# Perform Hungarian analysis on distance matrix between atoms of 1st
# structure and trial structure
distances = cdist(A, B, "euclidean")
a_inds, b_inds = linear_sum_assignment(distances)
perm_inds[q_atom_inds] = p_atom_inds[b_inds]
yield perm_inds
# rotate Q onto P considering that the axis are parallel and antiparallel
U = HungarianOrderMatcher.rotation_matrix_vectors(q_axis, -p_axis)
p_centroid_test = np.dot(p_centroid, U)
# generate full view from q shape to fill in atom view on the fly
perm_inds = np.zeros(len(p_atoms), dtype=int)
# Find unique atoms
species = np.unique(p_atoms)
for specie in species:
p_atom_inds = np.where(p_atoms == specie)[0]
q_atom_inds = np.where(q_atoms == specie)[0]
A = q_centroid[q_atom_inds]
B = p_centroid_test[p_atom_inds]
# Perform Hungarian analysis on distance matrix between atoms of 1st
# structure and trial structure
distances = cdist(A, B, "euclidean")
a_inds, b_inds = linear_sum_assignment(distances)
perm_inds[q_atom_inds] = p_atom_inds[b_inds]
yield perm_inds
[docs] @staticmethod
def get_principal_axis(coords, weights):
"""Get the molecule's principal axis.
Args:
coords: coordinates of atoms
weights: the weight use for calculating the inertia tensor
Returns:
Array of dim 3 containing the principal axis
"""
Ixx = Iyy = Izz = Ixy = Ixz = Iyz = 0.0
for (x, y, z), wt in zip(coords, weights):
Ixx += wt * (y * y + z * z)
Iyy += wt * (x * x + z * z)
Izz += wt * (x * x + y * y)
Ixy += -wt * x * y
Ixz += -wt * x * z
Iyz += -wt * y * z
inertia_tensor = np.array([[Ixx, Ixy, Ixz], [Ixy, Iyy, Iyz], [Ixz, Iyz, Izz]])
eigvals, eigvecs = np.linalg.eigh(inertia_tensor)
principal_axis = eigvecs[:, 0]
return principal_axis
[docs] @staticmethod
def rotation_matrix_vectors(v1, v2):
"""Returns the rotation matrix that rotates v1 onto v2 using
Rodrigues' rotation formula.
See more: https://math.stackexchange.com/a/476311
Args:
v1: initial vector
v2: target vector
Returns:
3x3 rotation matrix
"""
if np.allclose(v1, v2):
# same direction
return np.eye(3)
if np.allclose(v1, -v2):
# opposite direction: return a rotation of pi around the y-axis
return np.array([[-1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, -1.0]])
v = np.cross(v1, v2)
s = np.linalg.norm(v)
c = np.vdot(v1, v2)
vx = np.array([[0.0, -v[2], v[1]], [v[2], 0.0, -v[0]], [-v[1], v[0], 0.0]])
return np.eye(3) + vx + np.dot(vx, vx) * ((1.0 - c) / (s * s))
[docs]class GeneticOrderMatcher(KabschMatcher):
"""This method was inspired by genetic algorithms and tries to match molecules
based on their already matched fragments.
It uses the fact that when two molecule is matching their sub-structures have to match as well.
The main idea here is that in each iteration (generation) we can check the match of all possible
fragments and ignore those which are not feasible.
Although in the worst case this method has N! complexity (same as the brute force one),
in practice it performs much faster because many of the combination can be eliminated
during the fragment matching.
Notes:
This method very robust and returns with all the possible orders.
There is a well known weakness/corner case: The case when there is
a outlier with large deviation with a small index might be ignored.
This happens due to the nature of the average function
used to calculate the RMSD for the fragments.
When aligning molecules, the atoms of the two molecules **must** have the
same number of atoms from the same species.
"""
def __init__(self, target: Molecule, threshold: float):
"""Constructor of the matcher object.
Args:
target: a `Molecule` object used as a target during the alignment
threshold: value used to match fragments and prune configuration
"""
super().__init__(target)
self.threshold = threshold
self.N = len(target)
[docs] def match(self, p: Molecule):
"""Similar as `KabschMatcher.match` but this method also finds all of the
possible atomic orders according to the `threshold`.
Args:
p: a `Molecule` object what will be matched with the target one.
Returns:
Array of the possible matches where the elements are:
inds: The indices of atoms
U: 3x3 rotation matrix
V: Translation vector
rmsd: Root mean squared deviation between P and Q
"""
out = []
for inds in self.permutations(p):
p_prime = p.copy()
p_prime._sites = [p_prime[i] for i in inds]
U, V, rmsd = super().match(p_prime)
out.append((inds, U, V, rmsd))
return out
[docs] def fit(self, p: Molecule):
"""Order, rotate and transform all of the matched `p` molecule
according to the given `threshold`.
Args:
p: a `Molecule` object what will be matched with the target one.
Returns:
Array of the possible matches where the elements are:
p_prime: Rotated and translated of the `p` `Molecule` object
rmsd: Root-mean-square-deviation between `p_prime` and the `target`
"""
out = []
for inds in self.permutations(p):
p_prime = p.copy()
p_prime._sites = [p_prime[i] for i in inds]
U, V, rmsd = super().match(p_prime)
# Rotate and translate matrix `p` onto the target molecule.
# P' = P * U + V
for site in p_prime:
site.coords = np.dot(site.coords, U) + V
out.append((p_prime, rmsd))
return out
[docs] def permutations(self, p: Molecule):
"""Generates all of possible permutations of atom order according the threshold.
Args:
p: a `Molecule` object what will be matched with the target one.
Returns:
Array of index arrays
"""
# caching atomic numbers and coordinates
p_atoms, q_atoms = p.atomic_numbers, self.target.atomic_numbers
p_coords, q_coords = p.cart_coords, self.target.cart_coords
if sorted(p_atoms) != sorted(q_atoms):
raise ValueError("The number of the same species aren't matching!")
# starting maches (only based on element)
partial_matches = [[j] for j in range(self.N) if p_atoms[j] == q_atoms[0]]
for i in range(1, self.N):
# extending the target fragment with then next atom
f_coords = q_coords[: i + 1]
f_atom = q_atoms[i]
f_trans = f_coords.mean(axis=0)
f_centroid = f_coords - f_trans
matches = []
for indices in partial_matches:
for j in range(self.N):
# skipping if the this index is already matched
if j in indices:
continue
# skipping if they are different species
if p_atoms[j] != f_atom:
continue
inds = indices + [j]
P = p_coords[inds]
# Both sets of coordinates must be translated first, so that
# their centroid coincides with the origin of the coordinate system.
p_trans = P.mean(axis=0)
p_centroid = P - p_trans
# The optimal rotation matrix U using Kabsch algorithm
U = self.kabsch(p_centroid, f_centroid)
p_prime_centroid = np.dot(p_centroid, U)
rmsd = np.sqrt(np.mean(np.square(p_prime_centroid - f_centroid)))
# rejecting if the deviation is too large
if rmsd > self.threshold:
continue
logger.debug("match - rmsd: {}, inds: {}".format(rmsd, inds))
matches.append(inds)
partial_matches = matches
logger.info(
"number of atom in the fragment: {}, " "number of possible matches: {}".format(i + 1, len(matches))
)
return matches