Source code for lale.lib.sklearn.feature_agglomeration

# Copyright 2019 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sklearn.cluster.hierarchical
import lale.helpers
import lale.operators
import numpy as np

class FeatureAgglomerationImpl():
    """Wrapper that exposes scikit-learn's FeatureAgglomeration to lale.

    The constructor only records the hyperparameters; the underlying
    scikit-learn estimator is created afresh on every call to ``fit``.
    """

    def __init__(self, n_clusters=2, affinity='euclidean', memory=None, connectivity=None, compute_full_tree=None, linkage='ward', pooling_func=None):
        """Record the hyperparameters to pass to the scikit-learn estimator.

        All arguments are forwarded as keyword arguments with the same
        names.  ``pooling_func=None`` is replaced by ``np.mean``.
        """
        # scikit-learn calls pooling_func(...) inside transform(), so passing
        # None through would fail there with a TypeError.  np.mean is the
        # default declared by this module's hyperparameter schema.
        if pooling_func is None:
            pooling_func = np.mean
        # NOTE(review): compute_full_tree defaults to None here while the
        # hyperparameter schema declares 'auto'; it is forwarded unchanged to
        # keep existing behavior -- confirm which default is intended.
        self._hyperparams = {
            'n_clusters': n_clusters,
            'affinity': affinity,
            'memory': memory,
            'connectivity': connectivity,
            'compute_full_tree': compute_full_tree,
            'linkage': linkage,
            'pooling_func': pooling_func}

    def fit(self, X, y=None):
        """Build a new scikit-learn estimator from the stored hyperparameters
        and fit it on ``X`` (``y`` is passed through).  Returns ``self``.
        """
        self._sklearn_model = sklearn.cluster.hierarchical.FeatureAgglomeration(**self._hyperparams)
        self._sklearn_model.fit(X, y)
        return self

    def transform(self, X):
        """Return the fitted estimator's ``transform(X)``.

        ``fit`` must have been called first; otherwise ``_sklearn_model``
        does not exist and an AttributeError is raised.
        """
        return self._sklearn_model.transform(X)
# JSON schema for the constructor arguments of FeatureAgglomerationImpl.
# The first 'allOf' entry describes each hyperparameter on its own; the
# remaining entries are cross-hyperparameter constraints.
_hyperparams_schema = {
    "$schema": "http://json-schema.org/draft-04/schema#",
    "description": "Agglomerate features.",
    "allOf": [
        {
            "type": "object",
            "required": ["memory", "compute_full_tree", "pooling_func"],
            "relevantToOptimizer": [
                "n_clusters",
                "affinity",
                "compute_full_tree",
                "linkage",
            ],
            "additionalProperties": False,
            "properties": {
                "n_clusters": {
                    "type": "integer",
                    "minimumForOptimizer": 2,
                    "maximumForOptimizer": 8,
                    "default": 2,
                    "description": "The number of clusters to find.",
                },
                "affinity": {
                    "anyOf": [
                        {
                            "enum": [
                                "euclidean",
                                "l1",
                                "l2",
                                "manhattan",
                                "cosine",
                                "precomputed",
                            ],
                        },
                        # callable
                        {"forOptimizer": False, "type": "object"},
                    ],
                    "default": "euclidean",
                    "description": 'Metric used to compute the linkage. Can be "euclidean", "l1", "l2",',
                },
                "memory": {
                    "anyOf": [
                        {"type": "string"},
                        # object with the joblib.Memory interface
                        {"forOptimizer": False, "type": "object"},
                        {"enum": [None]},
                    ],
                    "default": None,
                    "description": "Used to cache the output of the computation of the tree.",
                },
                "connectivity": {
                    "anyOf": [
                        {
                            "type": "array",
                            "items": {
                                "type": "array",
                                "items": {"type": "number"},
                            },
                        },
                        # a callable that transforms the data into a
                        # connectivity matrix, such as derived from
                        # kneighbors_graph
                        {"forOptimizer": False, "type": "object"},
                        {"enum": [None]},
                    ],
                    "default": None,
                    "description": "Connectivity matrix. Defines for each feature the neighboring",
                },
                "compute_full_tree": {
                    "anyOf": [{"type": "boolean"}, {"enum": ["auto"]}],
                    "default": "auto",
                    "description": "Stop early the construction of the tree at n_clusters. This is",
                },
                "linkage": {
                    "enum": ["ward", "complete", "average", "single"],
                    "default": "ward",
                    "description": "Which linkage criterion to use. The linkage criterion determines which",
                },
                "pooling_func": {
                    "description": "This combines the values of agglomerated features into a single",
                    "default": np.mean,
                },
            },
        },
        {
            "description": 'affinity, if linkage is "ward", only "euclidean" is accepted',
            "anyOf": [
                {
                    "type": "object",
                    "properties": {"affinity": {"enum": ["euclidean"]}},
                },
                {
                    "type": "object",
                    "properties": {"linkage": {"not": {"enum": ["ward"]}}},
                },
            ],
        },
        # NOTE(review): the enums in the three constraints below contain the
        # *strings* 'True' / 'None', not the Python values True / None, and
        # 'distance_threshold' is not among the declared properties above.
        # As written these constraints look like they can never reject a
        # configuration -- confirm the intent before tightening them.
        {
            "description": "compute_full_tree, useful only when specifying a connectivity matrix",
            "anyOf": [
                {
                    "type": "object",
                    "properties": {
                        "compute_full_tree": {"not": {"enum": ["True"]}},
                    },
                },
                {
                    "type": "object",
                    "properties": {
                        "connectivity": {"not": {"enum": ["None"]}},
                    },
                },
            ],
        },
        {
            "description": "n_clusters must be None if distance_threshold is not None.",
            "anyOf": [
                {
                    "type": "object",
                    "properties": {"n_clusters": {"enum": ["None"]}},
                },
                {
                    "type": "object",
                    "properties": {"distance_threshold": {"enum": ["None"]}},
                },
            ],
        },
        {
            "description": "compute_full_tree must be True if distance_threshold is not None.",
            "anyOf": [
                {
                    "type": "object",
                    "properties": {"compute_full_tree": {"enum": ["True"]}},
                },
                {
                    "type": "object",
                    "properties": {"distance_threshold": {"enum": ["None"]}},
                },
            ],
        },
    ],
}

# Schema for the arguments of fit(): X is an array of arrays of numbers.
_input_fit_schema = {
    "$schema": "http://json-schema.org/draft-04/schema#",
    "description": "Fit the hierarchical clustering on the data",
    "type": "object",
    "required": ["X"],
    "properties": {
        "X": {
            "type": "array",
            "items": {
                "type": "array",
                "items": {"type": "number"},
            },
            "description": "The data",
        },
        "y": {"description": "Ignored"},
    },
}

# Schema for the arguments of transform().
_input_transform_schema = {
    "$schema": "http://json-schema.org/draft-04/schema#",
    "description": "Transform a new matrix using the built clustering",
    "type": "object",
    "required": ["X"],
    "properties": {
        "X": {
            "type": "array",
            "items": {
                "type": "array",
                "items": {"type": "number"},
            },
            "description": "A M by N array of M observations in N dimensions or a length",
        },
    },
}

# Schema for the value returned by transform().
_output_transform_schema = {
    "$schema": "http://json-schema.org/draft-04/schema#",
    "description": "The pooled values for each feature cluster.",
    "type": "array",
    "items": {
        "type": "array",
        "items": {"type": "number"},
    },
}

# Bundle of all schemas plus metadata, handed to lale.operators.make_operator.
_combined_schemas = {
    "$schema": "http://json-schema.org/draft-04/schema#",
    "description": "Combined schema for expected data and hyperparameters.",
    "documentation_url": "https://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html",
    "type": "object",
    "tags": {
        "pre": [],
        "op": ["transformer"],
        "post": [],
    },
    "properties": {
        "hyperparams": _hyperparams_schema,
        "input_fit": _input_fit_schema,
        "input_predict": _input_transform_schema,
        "output": _output_transform_schema,
    },
}

# When run as a script, only check that the combined schema is well-formed.
if __name__ == "__main__":
    lale.helpers.validate_is_schema(_combined_schemas)

# Public lale operator built from the implementation class and its schemas.
FeatureAgglomeration = lale.operators.make_operator(
    FeatureAgglomerationImpl, _combined_schemas
)