-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathselfmade_clustering.py
More file actions
171 lines (132 loc) · 6.99 KB
/
selfmade_clustering.py
File metadata and controls
171 lines (132 loc) · 6.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
'''
Your own clustering class should inherit from
class ClusteringAlgorithm():
def __init__(self):
"""!
Initialization of the base class
"""
## groups (array)
self._groups=[]
self._sz=-1
## list of the names of the features
self._generated_features={}
def get_groups(self):
""" Retrieve groups from algorithms
Returns:
groups (array): an array filled with cluster/group indices for each smiles
"""
assert len(self._groups)>0, 'groups not initialised, call cluster'
return self._groups
def size(self):
""" Retrieve groups from algorithms
Returns:
sz (int): size of the last given list of smiles
"""
assert len(self._groups)==self.list_sz, 'The size of the given smiles is not equal the generated groups'
return self._sz
def _check_input_smiles(self,smiles:List[str]):
try:
iterator = iter(smiles)
except TypeError:
smiles=[smiles]
#pandas.series check
if hasattr(smiles, 'tolist'):
smiles = smiles.tolist()
assert isinstance(smiles, list)
self._sz=len(smiles)
return smiles
def get_generated_features(self):
""" Retrieve groups from algorithms
example dictionary contains:
example_dict={ key:{X:X, cid: generator_name }} with key, the key from the feature_generator,
X, the 2d numpy matrix and
generator_name, the name/version of the feature generator.
Returns:
generated_features (dict): return a dictionary with the keys and generated features of the clustering process.
"""
return self._generated_features
def cluster(self,smiles:List[str]):
"""generate the groups from a given list of smiles
Args:
smiles (list[str]): list of smiles (list of strings)
Attributes:
sz (int): length of smiles list
groups (array): an array filled with cluster/group indices for the smiles list
"""
pass
def clear_generated_features(self):
"""clears the generated features
Attributes:
generated_features: set to empty
"""
self._generated_features={}
In essence you only have to implement the init and cluster functionality. The cluster function takes as input a list/series of smiles and sets the
attribute _groups. This is essential. The groups are the assigned cluster indices.
Your code can ignore nan features and assign them to a failed class, the nan features are filtered out when actually training a model.
When generating features you can save them in the _generated_features attribute following the convention
example_dict={ key:{X:X, cid: generator_name }} with key, the key from the feature_generator,
X, the 2d numpy matrix and
generator_name, the name/version of the feature generator.
These are reused when the algorithm is used to define the groups in nested-cross-validation. The code retrieves the matrices and checks that the provided generator name
matches the generator name of the feature generators of the model (using the provided key)
'''
from automol.clustering import ClusteringAlgorithm
from automol.feature_generators import retrieve_default_offline_generators, ECFPGenerator
from typing import List
import numpy as np
class SklearnClusteringForSmiles(ClusteringAlgorithm):
    """Cluster smiles by running a provided estimator on generated feature matrices.

    The feature matrices of all used feature generators are concatenated
    column-wise and handed to the estimator's ``fit_predict``. Rows that
    contain NaN values are excluded from the fit and keep the group index -1.
    """

    def __init__(self, *, feature_generators: dict = None, used_features: List[str] = None,
                 random_state: int = 42, sklearn_estimator=None):
        """Initialisation of SklearnClusteringForSmiles with provided dictionary of feature generators and list of used features.

        If feature generation dictionary is not provided (None or empty), default public generators are used.
        If used features is not provided or provided features are not available in the generator dictionary,
        Bottleneck is used if present in generation dictionary,
        otherwise all generators in the dictionary are used.
        A requested feature that is missing from the dictionary but whose key starts with 'fps' is
        created on the fly from a key of the form 'fps_<nBits>_<radius>'.

        NOTE: a provided dictionary is stored by reference, so generators created on the fly
        are also added to the caller's dictionary.

        Args:
            feature_generators(dict): dictionary containing different feature generators
            used_features(list[str]): list of keys indicating the used features
            random_state(int): stored on the instance; not used by the methods of this class
            sklearn_estimator: object providing fit_predict(X), used by cluster
        Raises:
            ValueError: if a requested 'fps' key does not have the form 'fps_<nBits>_<radius>'
        """
        super().__init__()
        # None default instead of a shared mutable {}; both None and {} select the defaults
        if not feature_generators:
            self._feature_generators = retrieve_default_offline_generators(model='ChEMBL', radius=2, nbits=2048)
        else:
            assert isinstance(feature_generators, dict), 'provided feature generators must be dictionary'
            self._feature_generators = feature_generators
        if used_features is None or len(used_features) < 1:
            used_features = ['Bottleneck']
        self._used_features = []
        for feat in used_features:
            if feat in self._feature_generators:
                self._used_features.append(feat)
            elif feat.startswith('fps'):
                self._feature_generators[feat] = self._build_ecfp_generator(feat)
                self._used_features.append(feat)
        # none of the requested features is available: fall back as described above
        if len(self._used_features) < 1:
            if 'Bottleneck' in self._feature_generators:
                self._used_features.append('Bottleneck')
            else:
                self._used_features.extend(self._feature_generators)
        self._random_state = random_state
        self._estimator = sklearn_estimator

    @staticmethod
    def _build_ecfp_generator(feature_key: str):
        """Create an ECFPGenerator from a key of the form 'fps_<nBits>_<radius>'."""
        parts = feature_key.split('_')
        try:
            # split() yields strings; convert so the generator receives integer sizes
            n_bits, radius = int(parts[1]), int(parts[2])
        except (IndexError, ValueError) as err:
            raise ValueError(
                "fingerprint feature key must have the form 'fps_<nBits>_<radius>', got {!r}".format(feature_key)
            ) from err
        return ECFPGenerator(radius=radius, nBits=n_bits)

    def cluster(self, smiles):
        """Generate the groups from a given list of smiles.

        Args:
            smiles (list[str]): list of smiles (list of strings)
        Attributes:
            sz (int): length of smiles list
            groups (array): an array filled with cluster/group indices for the smiles list,
                -1 for smiles whose features contain NaN
            generated_features(dict): dictionary with all the generated feature matrices
        Raises:
            ValueError: if no estimator was provided at initialisation
        """
        if self._estimator is None:
            # fail early with a clear message instead of an AttributeError on None after feature generation
            raise ValueError('no sklearn_estimator provided, cannot cluster the smiles')
        smiles = self._check_input_smiles(smiles)
        # -1 marks smiles that were not assigned to a cluster
        self._groups = np.full(len(smiles), -1)
        feature_blocks = []
        for key in self._used_features:
            generator = self._feature_generators[key]
            self._generated_features[key] = {'X': generator.generate(smiles), 'cid': generator.get_generator_name()}
            feature_blocks.append(self._generated_features[key]['X'])
        X_train = np.concatenate(feature_blocks, axis=-1)
        # row-wise NaN mask; the feature matrices are expected to be 2d (one row per smiles)
        feature_na = np.isnan(X_train).any(axis=1)
        valid_rows = ~feature_na
        # when every row contains NaN there is nothing to fit, all groups stay -1
        if valid_rows.any():
            self._groups[valid_rows] = self._estimator.fit_predict(X_train[valid_rows])