Source code for lale.datasets.uci.uci_datasets

# Copyright 2019 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import os
import pandas as pd
import tempfile
import urllib.request
import zipfile
import lale.datasets.data_schemas

# Local cache root: a 'download_data' folder next to this module. Each dataset
# gets its own sub-directory named after its dataset id.
download_data_dir = os.path.join(os.path.dirname(__file__), 'download_data')
# Base URL of the remote archive; zip files are fetched from
# '<download_data_url>/<dataset_id>/<zip_name>'.
download_data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases'

def download(dataset_id, zip_name, contents_files):
    """Make sure the requested files of a dataset exist locally.

    The files are looked up in ``<download_data_dir>/<dataset_id>``. If any
    of them is missing, the archive
    ``<download_data_url>/<dataset_id>/<zip_name>`` is downloaded to a
    temporary location and the missing members are extracted from it.
    Files that already exist are left untouched.

    Parameters
    ----------
    dataset_id : str
        Name of the dataset folder, both remotely and in the local cache.
    zip_name : str
        File name of the zip archive inside the remote dataset folder.
    contents_files : list of str
        Member names inside the archive that the caller needs.

    Returns
    -------
    list of str
        Local paths of the requested files, in the order of
        ``contents_files``.

    Raises
    ------
    FileNotFoundError
        If some requested file is still absent after extraction.
    """
    zip_url = f'{download_data_url}/{dataset_id}/{zip_name}'
    data_dir = os.path.join(download_data_dir, dataset_id)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(data_dir, exist_ok=True)
    full_file_names = [os.path.join(data_dir, base) for base in contents_files]

    def missing_files():
        return [full for full in full_file_names if not os.path.exists(full)]

    if missing_files():
        # Download into a temporary directory instead of an open
        # NamedTemporaryFile: an open named temporary file cannot be
        # reopened by name on every platform.
        with tempfile.TemporaryDirectory() as tmp_dir:
            zip_path = os.path.join(tmp_dir, 'download.zip')
            urllib.request.urlretrieve(zip_url, zip_path)
            with zipfile.ZipFile(zip_path) as myzip:
                for full, base in zip(full_file_names, contents_files):
                    if not os.path.exists(full):
                        myzip.extract(base, data_dir)

    # Post-condition: actually evaluate the check. zipfile may sanitize
    # member names on extraction, so a member can land somewhere other
    # than the expected path.
    still_missing = missing_files()
    if still_missing:
        raise FileNotFoundError(
            f'files missing after extracting {zip_url}: {still_missing}')
    return full_file_names
def tsv_to_Xy(file_name, target_col, schema_orig):
    """Read a tab-separated file and split it into features and target.

    Parameters
    ----------
    file_name : str
        Path of the TSV file; it is read with ``pd.read_csv(sep='\\t')``.
    target_col : str
        Name of the column that becomes ``y``. Every column schema whose
        'description' differs from it goes into ``X``.
    schema_orig : dict
        Schema of the whole table. ``schema_orig['items']['items']`` must
        be a list of per-column schemas, each with a 'description' key
        holding the column name.

    Returns
    -------
    tuple
        ``(data_X, data_y)``, each being whatever
        ``lale.datasets.data_schemas.add_schema`` returns for the selected
        data and the schema derived for it.
    """
    table = pd.read_csv(file_name, sep='\t')

    # Partition the per-column schemas into feature columns and the target.
    feature_schemas, target_schemas = [], []
    for column_schema in schema_orig['items']['items']:
        if column_schema['description'] == target_col:
            target_schemas.append(column_schema)
        else:
            feature_schemas.append(column_schema)

    feature_names = [schema['description'] for schema in feature_schemas]
    data_X = table.loc[:, feature_names]
    nrows, ncols_X = data_X.shape

    # Start from the original schema and pin the exact row/column counts.
    schema_X = dict(schema_orig)
    schema_X.update({
        'minItems': nrows,
        'maxItems': nrows,
        'items': {
            'type': 'array',
            'minItems': ncols_X,
            'maxItems': ncols_X,
            'items': feature_schemas,
        },
    })
    data_X = lale.datasets.data_schemas.add_schema(data_X, schema_X)

    data_y = table[target_col]
    schema_y = dict(schema_orig)
    schema_y.update({
        'minItems': nrows,
        'maxItems': nrows,
        'items': target_schemas[0],
    })
    data_y = lale.datasets.data_schemas.add_schema(data_y, schema_y)

    return data_X, data_y
def fetch_drugscom():
    """Fetch the drugs.com review dataset as train and test splits.

    Downloads 'drugsCom_raw.zip' of dataset '00462' if its two TSV files
    are not available locally, then loads each file with ``tsv_to_Xy``
    using 'rating' as the target column.

    Returns
    -------
    tuple
        ``(train_X, train_y, test_X, test_y)``.
    """
    files = download(
        '00462', 'drugsCom_raw.zip',
        ['drugsComTest_raw.tsv', 'drugsComTrain_raw.tsv'])
    target_col = 'rating'
    json_schema = {
        '$schema': 'http://json-schema.org/draft-04/schema#',
        'type': 'array',
        'items': {
            'type': 'array', 'minItems': 6, 'maxItems': 6,
            'items': [
                {'description': 'drugName', 'type': 'string'},
                # np.nan rather than np.NaN: the capitalised alias was
                # removed in NumPy 2.0.
                {'description': 'condition',
                 'anyOf': [{'type': 'string'}, {'enum': [np.nan]}]},
                {'description': 'review', 'type': 'string'},
                {'description': 'rating',
                 'enum': [1.0, 2.0, 3.0, 4.0, 5.0,
                          6.0, 7.0, 8.0, 9.0, 10.0]},
                {'description': 'date', 'type': 'string'},
                {'description': 'usefulCount',
                 'type': 'integer', 'minimum': 0},
            ],
        },
    }
    # download() returns paths in the requested order: test file first,
    # train file second.
    test_X, test_y = tsv_to_Xy(files[0], target_col, json_schema)
    train_X, train_y = tsv_to_Xy(files[1], target_col, json_schema)
    return train_X, train_y, test_X, test_y