from sklearn.decomposition.online_lda import LatentDirichletAllocation as SKLModel
import lale.helpers
import lale.operators
from numpy import nan, inf
class LatentDirichletAllocationImpl():
    """Implementation class wrapping the estimator imported as ``SKLModel``.

    The constructor only records the hyperparameters; the wrapped estimator
    is created anew on every call to :meth:`fit`, and :meth:`transform`
    delegates to the estimator created by the most recent ``fit``.
    """

    def __init__(
        self,
        n_components=10,
        doc_topic_prior=None,
        topic_word_prior=None,
        learning_method='batch',
        learning_decay=0.7,
        learning_offset=10.0,
        max_iter=10,
        batch_size=128,
        evaluate_every=(- 1),
        total_samples=1000000.0,
        perp_tol=0.1,
        mean_change_tol=0.001,
        max_doc_update_iter=100,
        n_jobs=None,
        verbose=0,
        random_state=None,
        n_topics=None,
    ):
        """Store every hyperparameter verbatim for later use by ``fit``.

        No validation is performed here and no estimator is built; the
        values are forwarded unchanged as keyword arguments to ``SKLModel``.
        """
        self._hyperparams = {
            'n_components': n_components,
            'doc_topic_prior': doc_topic_prior,
            'topic_word_prior': topic_word_prior,
            'learning_method': learning_method,
            'learning_decay': learning_decay,
            'learning_offset': learning_offset,
            'max_iter': max_iter,
            'batch_size': batch_size,
            'evaluate_every': evaluate_every,
            'total_samples': total_samples,
            'perp_tol': perp_tol,
            'mean_change_tol': mean_change_tol,
            'max_doc_update_iter': max_doc_update_iter,
            'n_jobs': n_jobs,
            'verbose': verbose,
            'random_state': random_state,
            'n_topics': n_topics}

    def fit(self, X, y=None):
        """Build a fresh ``SKLModel`` from the stored hyperparameters and fit it.

        Args:
            X: Training data, passed through to the wrapped estimator.
            y: Optional targets; forwarded to the wrapped ``fit`` only when
                it is not ``None``.

        Returns:
            This instance, so calls can be chained.
        """
        # A new estimator is created on each call, so refitting discards
        # whatever the previous fit produced.
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def transform(self, X):
        """Transform ``X`` with the estimator created by the last ``fit``.

        Args:
            X: Data passed through to the wrapped estimator's ``transform``.

        Returns:
            Whatever the wrapped estimator's ``transform`` returns.

        Raises:
            AttributeError: If ``fit`` has not been called yet, because the
                wrapped estimator only exists after fitting.
        """
        return self._sklearn_model.transform(X)
# JSON schema (draft-04) for the hyperparameters accepted by
# LatentDirichletAllocationImpl.__init__.
#
# The first entry of 'allOf' declares each hyperparameter (type, default and,
# where given, the range an optimizer should search).  The remaining entries
# are cross-parameter side constraints.
#
# NOTE(review): several 'description' strings below end mid-sentence; they
# look truncated by whatever generated this file -- confirm against the
# upstream estimator documentation before relying on them.
_hyperparams_schema = {
    '$schema': 'http://json-schema.org/draft-04/schema#',
    'description': 'inherited docstring for LatentDirichletAllocation Latent Dirichlet Allocation with online variational Bayes algorithm',
    'allOf': [
        {
            'type': 'object',
            'required': [
                'n_components',
                'doc_topic_prior',
                'topic_word_prior',
                'learning_method',
                'learning_decay',
                'learning_offset',
                'max_iter',
                'batch_size',
                'evaluate_every',
                'total_samples',
                'perp_tol',
                'mean_change_tol',
                'max_doc_update_iter',
                'n_jobs',
                'verbose',
                'random_state',
                'n_topics',
            ],
            'relevantToOptimizer': [
                'n_components',
                'learning_method',
                'max_iter',
                'batch_size',
                'evaluate_every',
                'total_samples',
                'max_doc_update_iter',
            ],
            'additionalProperties': False,
            'properties': {
                'n_components': {
                    'type': 'integer',
                    'minimumForOptimizer': 2,
                    'maximumForOptimizer': 256,
                    'distribution': 'uniform',
                    'default': 10,
                    'description': 'Number of topics.'},
                'doc_topic_prior': {
                    'anyOf': [
                        {'type': 'number'},
                        {'enum': [None]}],
                    'default': None,
                    'description': 'Prior of document topic distribution `theta`. If the value is None,'},
                'topic_word_prior': {
                    'anyOf': [
                        {'type': 'number'},
                        {'enum': [None]}],
                    'default': None,
                    'description': 'Prior of topic word distribution `beta`. If the value is None, defaults'},
                'learning_method': {
                    'enum': ['batch', 'online'],
                    'default': 'batch',
                    'description': 'Method used to update `_component`. Only used in `fit` method.'},
                'learning_decay': {
                    'type': 'number',
                    'default': 0.7,
                    'description': 'It is a parameter that control learning rate in the online learning'},
                'learning_offset': {
                    'type': 'number',
                    'default': 10.0,
                    'description': 'A (positive) parameter that downweights early iterations in online'},
                'max_iter': {
                    'type': 'integer',
                    'minimumForOptimizer': 10,
                    'maximumForOptimizer': 1000,
                    'distribution': 'uniform',
                    'default': 10,
                    'description': 'The maximum number of iterations.'},
                'batch_size': {
                    'type': 'integer',
                    'minimumForOptimizer': 3,
                    'maximumForOptimizer': 128,
                    'distribution': 'uniform',
                    'default': 128,
                    'description': 'Number of documents to use in each EM iteration. Only used in online'},
                'evaluate_every': {
                    'type': 'integer',
                    'minimumForOptimizer': -1,
                    'maximumForOptimizer': 0,
                    'distribution': 'uniform',
                    'default': -1,
                    'description': 'How often to evaluate perplexity. Only used in `fit` method.'},
                'total_samples': {
                    'anyOf': [
                        {
                            'type': 'integer',
                            'forOptimizer': False},
                        {
                            'type': 'number',
                            'minimumForOptimizer': 0.0,
                            'maximumForOptimizer': 1.0,
                            'distribution': 'uniform'}],
                    'default': 1000000.0,
                    'description': 'Total number of documents. Only used in the `partial_fit` method.'},
                'perp_tol': {
                    'type': 'number',
                    'default': 0.1,
                    'description': 'Perplexity tolerance in batch learning. Only used when'},
                'mean_change_tol': {
                    'type': 'number',
                    'default': 0.001,
                    'description': 'Stopping tolerance for updating document topic distribution in E-step.'},
                'max_doc_update_iter': {
                    'type': 'integer',
                    'minimumForOptimizer': 100,
                    'maximumForOptimizer': 101,
                    'distribution': 'uniform',
                    'default': 100,
                    'description': 'Max number of iterations for updating document topic distribution in'},
                'n_jobs': {
                    'anyOf': [
                        {'type': 'integer'},
                        {'enum': [None]}],
                    'default': None,
                    'description': 'The number of jobs to use in the E-step.'},
                'verbose': {
                    'type': 'integer',
                    'default': 0,
                    'description': 'Verbosity level.'},
                'random_state': {
                    'anyOf': [
                        {'type': 'integer'},
                        {'type': 'object'},
                        {'enum': [None]}],
                    'default': None,
                    'description': 'If int, random_state is the seed used by the random number generator;'},
                'n_topics': {
                    'anyOf': [
                        {'type': 'integer'},
                        {'enum': [None]}],
                    'default': None,
                    'description': 'This parameter has been renamed to n_components and will'},
            },
        },
        {
            # NOTE(review): 'method' is not a declared hyperparameter and the
            # schema above forbids additional properties, so the second branch
            # accepts every valid configuration and this constraint never
            # rejects anything.  The same applies to the other constraints
            # below that test 'method'.
            'description': 'learning_method, only used in fit method',
            'anyOf': [
                {
                    'type': 'object',
                    'properties': {
                        'learning_method': {'enum': ['batch']},
                    }},
                {
                    'type': 'object',
                    'properties': {
                        'method': {'enum': ['fit']},
                    }}],
        },
        {
            # A non-default batch_size is only meaningful together with
            # learning_method == 'online'.  The second branch must test the
            # declared hyperparameter 'learning_method'; it previously tested
            # a non-existent property named 'learning', which made the whole
            # constraint a no-op.
            'description': 'batch_size, only used in online learning',
            'anyOf': [
                {
                    'type': 'object',
                    'properties': {
                        'batch_size': {'enum': [128]},
                    }},
                {
                    'type': 'object',
                    'properties': {
                        'learning_method': {'enum': ['online']},
                    }}],
        },
        {
            'description': 'evaluate_every, only used in fit method',
            'anyOf': [
                {
                    'type': 'object',
                    'properties': {
                        'evaluate_every': {'enum': [-1]},
                    }},
                {
                    'type': 'object',
                    'properties': {
                        'method': {'enum': ['fit']},
                    }}],
        },
        {
            'description': 'total_samples, only used in the partial_fit method',
            'anyOf': [
                {
                    'type': 'object',
                    'properties': {
                        'total_samples': {'enum': [1000000.0]},
                    }},
                {
                    'type': 'object',
                    'properties': {
                        'method': {'enum': ['partial_fit']},
                    }}],
        },
        {
            # Placeholder only: this entry carries no validation keywords, so
            # the perp_tol / evaluate_every relationship is not enforced.
            'XXX TODO XXX': 'Parameter: perp_tol > only used when evaluate_every is greater than 0'},
    ],
}
# JSON schema (draft-04) for the arguments of ``fit``: ``X`` is either an
# array whose item type is still an unresolved placeholder, or an array of
# arrays of numbers; ``y`` is left unconstrained.
_input_fit_schema = {
    "$schema": "http://json-schema.org/draft-04/schema#",
    "description": "Learn model for the data X with variational Bayes method.",
    "type": "object",
    "properties": {
        "X": {
            "anyOf": [
                {
                    # Placeholder alternative: item type not yet specified.
                    "type": "array",
                    "items": {"XXX TODO XXX": "item type"},
                    "XXX TODO XXX": "array-like or sparse matrix, shape=(n_samples, n_features)",
                },
                {
                    # Two-level nesting: array of arrays of numbers.
                    "type": "array",
                    "items": {
                        "type": "array",
                        "items": {"type": "number"},
                    },
                },
            ],
            "description": "Document word matrix.",
        },
        # Empty schema: any value is accepted for y.
        "y": {},
    },
}
# JSON schema (draft-04) for the arguments of ``transform``: ``X`` is either
# an array whose item type is still an unresolved placeholder, or an array of
# arrays of numbers.
_input_transform_schema = {
    "$schema": "http://json-schema.org/draft-04/schema#",
    "description": "Transform data X according to the fitted model.",
    "type": "object",
    "properties": {
        "X": {
            "anyOf": [
                {
                    # Placeholder alternative: item type not yet specified.
                    "type": "array",
                    "items": {"XXX TODO XXX": "item type"},
                    "XXX TODO XXX": "array-like or sparse matrix, shape=(n_samples, n_features)",
                },
                {
                    # Two-level nesting: array of arrays of numbers.
                    "type": "array",
                    "items": {
                        "type": "array",
                        "items": {"type": "number"},
                    },
                },
            ],
            "description": "Document word matrix.",
        },
    },
}
# JSON schema (draft-04) for the result of ``transform``.  Only a description
# and a placeholder note are present; no structural constraint is declared.
_output_transform_schema = {
    "$schema": "http://json-schema.org/draft-04/schema#",
    "description": "Document topic distribution for X.",
    "XXX TODO XXX": "shape=(n_samples, n_components)",
}
# Single schema bundling the operator's tags with the hyperparameter schema
# and the input/output data schemas defined above in this module.
_combined_schemas = {
    "$schema": "http://json-schema.org/draft-04/schema#",
    "description": "Combined schema for expected data and hyperparameters.",
    "type": "object",
    "tags": {
        "pre": [],
        "op": ["transformer"],
        "post": [],
    },
    "properties": {
        "hyperparams": _hyperparams_schema,
        "input_fit": _input_fit_schema,
        "input_transform": _input_transform_schema,
        "output_transform": _output_transform_schema,
    },
}
# When executed directly as a script, check the combined schema with lale's
# schema validator.
if __name__ == "__main__":
    lale.helpers.validate_is_schema(_combined_schemas)

# Public operator, created at import time from the implementation class and
# its combined schemas.
LatentDirichletAllocation = lale.operators.make_operator(
    LatentDirichletAllocationImpl,
    _combined_schemas,
)