# Source code for lale.datasets.sklearn_to_pandas

# Copyright 2019 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pandas as pd
import sklearn.datasets
import lale.datasets.data_schemas
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

def load_iris_df(test_size=0.2, random_state=42):
    """Load the iris dataset and return it as pandas train/test splits.

    The data is shuffled once and then split with ``train_test_split``;
    both steps use the same seed.

    Parameters
    ----------
    test_size : default 0.2
        Forwarded unchanged to ``train_test_split`` as ``test_size``.
    random_state : default 42
        Seed forwarded to both ``shuffle`` and ``train_test_split``.
        The default reproduces the previously hard-coded seed.

    Returns
    -------
    tuple
        ``((X_train_df, y_train_df), (X_test_df, y_test_df))`` where the
        ``X`` parts are DataFrames whose columns are ``iris.feature_names``
        and the ``y`` parts are Series named ``'target'``.
    """
    iris = sklearn.datasets.load_iris()
    target_name = 'target'

    # The explicit shuffle ahead of the split is kept on purpose: dropping it
    # would change which rows end up in each split for a given seed.
    X, y = shuffle(iris.data, iris.target, random_state=random_state)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    X_train_df = pd.DataFrame(X_train, columns=iris.feature_names)
    y_train_df = pd.Series(y_train, name=target_name)

    X_test_df = pd.DataFrame(X_test, columns=iris.feature_names)
    y_test_df = pd.Series(y_test, name=target_name)

    return (X_train_df, y_train_df), (X_test_df, y_test_df)

def load_digits_df(test_size=0.2, random_state=42):
    """Load the digits dataset as pandas train/test splits with schemas on X.

    Parameters
    ----------
    test_size : default 0.2
        Forwarded unchanged to ``train_test_split`` as ``test_size``.
    random_state : default 42
        Forwarded unchanged to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``((train_X, train_y), (test_X, test_y))``.  The ``X`` parts are
        DataFrames with columns ``x0``, ``x1``, ... that have been passed
        through ``lale.datasets.data_schemas.add_schema``; the ``y`` parts
        are plain Series named ``'target'``.
    """
    digits = sklearn.datasets.load_digits()
    train_X, test_X, train_y, test_y = train_test_split(
        digits.data, digits.target,
        test_size=test_size, random_state=random_state)

    ncols = train_X.shape[1]
    feature_names = [f'x{i}' for i in range(ncols)]
    train_X = pd.DataFrame(train_X, columns=feature_names)
    test_X = pd.DataFrame(test_X, columns=feature_names)
    train_y = pd.Series(train_y, name='target')
    test_y = pd.Series(test_y, name='target')

    # Row-count-independent part of the feature schema: each row is an array
    # of exactly `ncols` numbers in the range [0, 16].
    schema_X = {
      '$schema': 'http://json-schema.org/draft-04/schema#',
      'type': 'array',
      'items': {
        'type': 'array',
        'minItems': ncols, 'maxItems': ncols,
        'items': {
          'type': 'number',
          'minimum': 0, 'maximum': 16}}}
    # NOTE(review): schema_y is built but never attached to train_y/test_y.
    # Confirm whether add_schema should also be applied to the targets.
    schema_y = {
      '$schema': 'http://json-schema.org/draft-04/schema#',
      'type': 'array',
      'items': {
        'type': 'integer',
        'minimum': 0, 'maximum': 9}}

    # The outer minItems/maxItems pin the number of rows of each split; they
    # are added at the top level and leave the nested per-row bounds intact.
    train_nrows, test_nrows = train_X.shape[0], test_X.shape[0]
    train_X = lale.datasets.data_schemas.add_schema(train_X, {
        **schema_X, 'minItems': train_nrows, 'maxItems': train_nrows })
    test_X = lale.datasets.data_schemas.add_schema(test_X, {
        **schema_X, 'minItems': test_nrows, 'maxItems': test_nrows })
    return (train_X, train_y), (test_X, test_y)

def california_housing_df(test_size=0.2, random_state=42):
    """Load California housing data as pandas train/test splits with schemas on X.

    The data comes from ``sklearn.datasets.fetch_california_housing``.

    Parameters
    ----------
    test_size : default 0.2
        Forwarded unchanged to ``train_test_split`` as ``test_size``.
    random_state : default 42
        Forwarded unchanged to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``((train_X, train_y), (test_X, test_y))``.  The ``X`` parts are
        DataFrames whose columns are ``housing.feature_names`` and that have
        been passed through ``lale.datasets.data_schemas.add_schema``; the
        ``y`` parts are plain Series named ``'target'``.
    """
    housing = sklearn.datasets.fetch_california_housing()
    train_X, test_X, train_y, test_y = train_test_split(
        housing.data, housing.target,
        test_size=test_size, random_state=random_state)
    train_X = pd.DataFrame(train_X, columns=housing.feature_names)
    test_X = pd.DataFrame(test_X, columns=housing.feature_names)
    train_y = pd.Series(train_y, name='target')
    test_y = pd.Series(test_y, name='target')

    # Per-row schema: a fixed-length tuple of 8 numeric features. Every
    # feature except Longitude is declared non-negative.
    schema_X = {
      '$schema': 'http://json-schema.org/draft-04/schema#',
      'type': 'array',
      'items': {
        'type': 'array', 'minItems': 8, 'maxItems': 8,
        'items': [
          {'description': 'MedInc', 'type': 'number', 'minimum': 0.0},
          {'description': 'HouseAge', 'type': 'number', 'minimum': 0.0},
          {'description': 'AveRooms', 'type': 'number', 'minimum': 0.0},
          {'description': 'AveBedrms', 'type': 'number', 'minimum': 0.0},
          {'description': 'Population', 'type': 'number', 'minimum': 0.0},
          {'description': 'AveOccup', 'type': 'number', 'minimum': 0.0},
          {'description': 'Latitude', 'type': 'number', 'minimum': 0.0},
          {'description': 'Longitude', 'type': 'number'}]}}

    # The outer minItems/maxItems pin the number of rows of each split; they
    # are added at the top level and leave the nested per-row bounds intact.
    train_nrows, test_nrows = train_X.shape[0], test_X.shape[0]
    train_X = lale.datasets.data_schemas.add_schema(train_X, {
        **schema_X, 'minItems': train_nrows, 'maxItems': train_nrows })
    test_X = lale.datasets.data_schemas.add_schema(test_X, {
        **schema_X, 'minItems': test_nrows, 'maxItems': test_nrows })
    return (train_X, train_y), (test_X, test_y)